Exploratory Data Analysis (EDA)"

```{whites, eval=FALSE, echo = eval_rows}

Drop empty rows

rowsums <- data.frame(sapply(df,is.na)) rows_drop <- (which(rowSums(rowsums) == ncol(df))) df <- df[-rows_drop, ,drop=FALSE]

```r
eval_num <- FALSE
eval_numcol <- FALSE
if (exists("df_num")){
  if (ncol(df_num)>0){
     indices <- which(colnames(df_code) %in% colnames(df_num))
     cat("\\# `Column names of selected continuous variables`")
     cat("\\newline ")
     cat("`colnames_continuous = ")
     cat(paste0("c(", paste(indices, collapse=','), ")`"))
     eval_num <- TRUE
     eval_numcol <- (ncol(df_num)>1)
  }
}

```{whites, eval=FALSE, echo = eval_num}

Data frame of the continuous variables

df_num <- df[ ,colnames_continuous, drop=FALSE]

```r
eval_cat <- FALSE
eval_catcol <- FALSE
if (exists("df_factor")){
  if (ncol(df_factor)>0){
     indices <- which(colnames(df_code) %in% colnames(df_factor))
     cat("\\# `Column names of selected categorical variables`")
     cat("\\newline ")
     cat("`colnames_categorical = ")
     cat(paste0("c(", paste(indices, collapse=','), ")`"))
     eval_cat <- TRUE
     eval_catcol <- (ncol(df_factor)>1)
  }
}

```{whites, eval=FALSE, echo = eval_cat}

Data frame of the categorical variables

df_factor <- df[ ,colnames_categorical, drop=FALSE]

```{whites, eval=FALSE, echo = eval_num}

# Continuous variables
## Descriptive statistics

### Take over summary from psych package and add new stats 
stats_new <- psych::describe(df_num)

### Drop some stats which we do not need  
stats_new <- as.data.frame(stats_new)
stats_new <- stats_new[c(-1,-6,-10,-13)]

### Add new stats
stats_new$Variable <- colnames(df_num)
stats_new$ntotal <- nrow(df_num)

### Missings
stats_new$miss <- sapply(df_num, function(col) sum(is.na(col)))

### Complete rate
stats_new$complete <- sapply(df_num, function(col) (1-(sum(is.na(col)) / nrow(df_num)))*100)

### N Unique
stats_new$N_Unique <- sapply(df_num, function(col) length(unique(na.omit(col))))

### CV
stats_new$CV <- sapply(df_num, function(col) {
  ifelse(any(col <= 0, na.rm=TRUE), "-", round((sd(col, na.rm=TRUE) / mean(col, na.rm=TRUE)),2))
  })

### Reorder columns 
stats_new <- stats_new[,c(10,11,12,1,13,14,2:9,15)]

### Column names 
colnames(stats_new) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", 
                         "SD", "Median", "MAD", "MIN", "MAX", "Skewness", "Kurtosis", "CV")

### Order by variable name
stats_new <- stats_new[order(stats_new$Variable),]

### Output
knitr::kable(stats_new, digits=2, row.names = FALSE, format="simple") 

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: Histograms One Per Page

Order by variable name

df_num_order <- df_num[,order(colnames(df_num)),drop=FALSE]

Function to plot histogram for each variable

single_hist <- function(x, main = "Histogram", ylab="Relative Frequency", xlab=NULL, freq=FALSE, bcol="#2fa42d", dcol=c("#396e9f","#396e9f"), dlty=c("dotted", "solid"), breaks=21) {

h <- hist(x, plot=FALSE, breaks=breaks) m <- mean(x, na.rm=TRUE) s <- sd(x, na.rm=TRUE) d <- density(x, na.rm=TRUE)

# Set nice x and y axis limits xlims <- pretty(c(floor(h$breaks[1]),ceiling(last(h$breaks)))) ymax <- max(h$density) dmax <- max(d$y) ymax <- max(ymax,dmax)

# Plots plot(h, freq=freq, ylim=c(0, ymax*1.2), ylab=ylab, xlab=xlab, main=main, col=bcol, xlim = c(min(xlims), max(xlims))) lines(d, lty=dlty[1], col=dcol[1]) curve(dnorm(x,m,s), add=TRUE, lty=dlty[2], col=dcol[2])

}

Loop over variables

for (i in 1:ncol(df_num)){ single_hist(df_num_order[,i], main = paste("Histogram of ", colnames(df_num_order[i]))) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Descriptive graphics: Histograms Summary
k <- ceiling(ncol(df_num)/20)-1
for (i in 0:k){
  m <- 20*i+1
  n <- min(20*(i+1),ncol(df_num))
  multi.hist(df_num_order[,m:n], dcol=c("#396e9f","#396e9f"), 
             bcol= "#2fa42d", 
             dlty=c("dotted", "solid"), 
             main = colnames(df_num_order[,m:n])) 
}

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: Box-Plot One Per Page

Loop over variables

for (i in 1:ncol(df_num)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = paste("Boxplot of",colnames(df_num_order[i])), xlab=paste(colnames(df_num_order[i])), horizontal = TRUE) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Descriptive graphics: Box-Plots Summary

### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables   
for(i in 1:ncol(df_num)){
  boxplot(df_num_order[,c(i)], col = "#2fa42d", main = colnames(df_num_order[i]), 
          xlab=paste(colnames(df_num_order[i])), xaxt="n", horizontal = TRUE)
} 

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Descriptive graphics: ECDF Plots One Per Page

Loop over variables

for (i in 1:ncol(df_num)){

data <- as.data.frame(df_num_order[,c(i)]) colnames(data) <- "variable"

# Plot ECDF step_function <- ecdf(data$variable) plot(step_function, main=paste("ECDF Plot of", colnames(df_num_order[i])), xlab=colnames(df_num_order[i]), ylab="ECDF", cex=0.7, col="#2fa42d", do.points=TRUE)

# Plot CDF of normal distribution data_mean<- mean(data$variable, na.rm=TRUE) data_sd<- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from=qnorm(0.0001, mean=data_mean, sd=data_sd), to=qnorm(0.9999, mean=data_mean, sd=data_sd), add=TRUE, col="#396e9f", lwd=2) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Graphics: ECDF Plots Summary

### ECDF function 
ecdf_plot <- function(i){

  data <- as.data.frame(df_num_order[,c(i)])
  colnames(data)<-"variable"

  # Plot ECDF
  step_function <- ecdf(data$variable)
  ecdf_plot <- plot(step_function,
                  main = colnames(df_num_order[i]),
                  xlab = colnames(df_num_order[i]), ylab = "ECDF",
                  cex = 0.7, col="#2fa42d", do.points = FALSE)

  # Plot CDF of normal distribution
  data_mean <- mean(data$variable, na.rm=TRUE)
  data_sd <- sd(data$variable, na.rm=TRUE)
  curve(pnorm(x, data_mean,data_sd),
        from = qnorm(0.0001, mean = data_mean, sd = data_sd), 
        to = qnorm(0.9999, mean = data_mean, sd = data_sd), 
        add = TRUE, col="#396e9f", lwd=0.5,pch=1)
}


### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables  
for(i in 1:ncol(df_num)) ecdf_plot(i)

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_num}

Continuous variables

Graphics: QQ Plots One Per Page

Define function for the QQ-Plot

qq_plot <- function(i, main, xlab, ylab){ var <- df_num_order[,i] qqplot(x = qnorm(ppoints(var), mean = mean(var, na.rm = TRUE), sd = sd(var, na.rm = TRUE)), y = var, xlim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), ylim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), main = main, xlab = xlab, ylab = ylab, col = "#2fa42d", cex=0.7, pch=19 ) abline(a = 0, b = 1, col = "#396e9f", lwd = 2) grid() }

Loop over variables

for (i in 1:ncol(df_num)){ qq_plot(i, main = paste("QQ-Plot of", colnames(df_num_order[i])), xlab = "Theoretical Quantiles, Normal Distribution", ylab = paste("Sample Quantiles for ", colnames(df_num_order[i])) ) }

```{whites, eval=FALSE, echo = eval_numcol}
# Continuous variables
## Graphics: QQ Plots Summary
### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_num_order))), 
            ceiling(sqrt(length(df_num_order)))), 
    mar=c(1.5,1,2,1), oma=c(1,1,1,1))

### Loop over variables   
for(i in 1:ncol(df_num)){
  qq_plot(i, colnames(df_num_order[i]), "", "")
}

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive statistics: Totals

Totals statistics

miss <- sapply(df_factor, function(col) sum(is.na(col))) complete <- sapply(df_factor, function(col) (1-(sum(is.na(col)) / nrow(df_factor)))*100) complete <- round(complete,3) totals <- data.frame(miss, complete) totals$Variable <- rownames(totals) totals$ntotal <- nrow(df_factor) totals$valid <- totals$ntotal - totals$miss totals$N_Unique <- sapply(df_factor, function(col) length(unique(col))) totals <- totals[,c(3,4,1,5,2,6)] totals <- totals[order(totals$Variable),] colnames(totals) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete","N Unique")

Output

kable(totals, digits=2, row.names = FALSE, format="simple")

```{whites, eval=FALSE, echo = eval_cat}
# Categorical variables 
## Descriptive statistics: Frequencies

### Function stats per variable
discrete <- function(i){

  # Calculate individual statistics
  count <- table(df_factor[,i], useNA="always")
  perc <- as.data.frame(prop.table(count))
  perc$Percent <- perc$Freq*100
  perc$Freq <- NULL

  # Merge to one dataframe
  freq <- merge(count, perc, by="Var1")
  freq$Variable <- rep(colnames(df_factor)[i],nrow(freq))
  freq <- freq[,c(4,1,2,3)]
  colnames(freq) <- c("Variable", "Category","Frequency", "Percent")

  # Rename missing category
  if(length(is.na(freq$Category))>0){
    levels(freq$Category) <- c(levels(freq$Category),"Missing")
    freq$Category[is.na(freq$Category)] <- "Missing"
  }

  # Sort 
  freq_order <- freq[order(-freq[,4],freq[,2]),]

  # Add category "All other values" in case of more than 20 categories
  min <- min(20, length(unique(df_factor[,i])))
  if(min==20){
    freq_order$Category <- as.character(freq_order$Category)
    freq_order <- rbind(freq_order[1:20,], 
                        c(colnames(df_factor)[i], as.character("****All Other Values****"),
                          sum(freq_order$Frequency[-c(1:20)]), sum(freq_order$Percent[-c(1:20)])))
  } else {
    freq_order <- freq_order[1:min,]
  }
  return(freq_order)
}

### Loop over variables
cat_table <- discrete(1)
for (i in 1:ncol(df_factor)){
  if (i>1){
    cat_i <- discrete(i)
    cat_table <- rbind(cat_table, cat_i)
  }
}

### Sort by variable name 
cat_table <- cat_table[order(cat_table$Variable),]
cat_table$Percent <- round(as.numeric(cat_table$Percent),2)

### Output  
kable(cat_table, digits=2, row.names = FALSE, format="simple") 

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive graphics: Bar-Plots One Per Page

Data frame sorted by column name

df_factor_order <- df_factor[,order(colnames(df_factor)), drop=FALSE]

Loop over variables

for (i in 1:ncol(df_factor)){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" counts <- counts[order(counts)]

# Plot by case (e.g. category names length) if (any(nchar(names(counts), type = "chars") >= 11) || length(counts) > 12){

 if(length(counts) > 40){
    # Bar-Plot with suppressed category names 
    par(mar = c(6,6,4.1, 2.1), mgp = c(3, 1, 0))
    barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), xaxt="n",
            ylab = "Frequency", cex.names = 0.6, las = 2, xlab = colnames(df_factor_order[i]),
            ylim = range(pretty(c(0,counts))))


  } else {
 # Bar-Plot with shortened category names          
   par(mar = c(8, 8, 4.1, 2.1), mgp = c(6, 1, 0))
   names(counts) <- substr(names(counts), 1, 15)
   barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", 
           cex.names = 0.65, xlab= colnames(df_factor_order[i]), las=2, ylim=range(pretty(c(0,counts))))

  }

} else { # Bar-Plot with full-length names par(mar = c(6,6, 4.1, 2.1), mgp = c(5, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.7, ylim=range(pretty(c(0,counts))), xlab= colnames(df_factor_order[i])) } }

```{whites, eval=FALSE, echo = eval_catcol}
# Categorical variables 
## Descriptive graphics: Bar-Plots Summary

### Function for Bar-Plot per variable 
plot_bar <- function(i){
  counts <- table(df_factor_order[i], useNA = "ifany")
  names(counts)[is.na(names(counts))] <- "Missing"
  names(counts)[names(counts)=="NA"] <- "Missing"
  counts <- counts[order(counts)]
  barplot(counts, col = "#2fa42d", main = colnames(df_factor_order[i]),
          ylab = "Frequency",  xaxt="n", ylim=range(pretty(c(0,counts))))
}

### Set graphical parameters 
par(mfrow=c(ceiling(sqrt(length(df_factor))), ceiling(sqrt(length(df_factor)))), 
    mar=c(1.5,1,2,1), oma= c(1,1,1,1))

### Loop over variables 
for(i in 1:ncol(df_factor)) plot_bar(i)

### Restore original graphical settings 
par(opar)

```{whites, eval=FALSE, echo = eval_cat}

Categorical variables

Descriptive graphics: Pie-Plots One Per Page

Function to create frequency table for each variable

freqtable <- function(col){

# Replace NA with "Missing" col[is.na(col)] <- "Missing"

# Create table with frequencies pie_table_unsorted <- as.data.frame(table(col)) pie_table_sorted <- pie_table_unsorted[order(pie_table_unsorted$Freq, decreasing=TRUE),] colnames(pie_table_sorted) <- c("Category", "Frequency")

# If more than 20 categories: summarize the smallest categories to one category if (nrow(pie_table_sorted)>20){ pie_table_sorted$Category <- as.character(pie_table_sorted$Category) pie_table_summarized <- rbind(pie_table_sorted[c(1:20),], c(as.character("All Other Values"), sum(pie_table_sorted$Frequency[-c(1:20)]))) pie_table_sorted <- pie_table_summarized } pie_table_sorted$RelFreq <- as.numeric(pie_table_sorted$Frequency) / length(col) return(pie_table_sorted) }

Plot function

plot_pie <- function(table, title, title_size, legend_pos){ # Direction of the legend if (max(nchar(as.character(table[,1])))>15){ legend = "vertical" } else { legend = "horizontal" } plot <- ggplot(table, aes(x = "", y = RelFreq, fill = reorder(Category, RelFreq))) + guides(fill = guide_legend(title="", reverse = FALSE, direction = legend)) + ggtitle(title) + geom_col() + geom_text(aes(label = scales::percent(RelFreq,accuracy = 0.01)),
position = position_stack(vjust = 0.5), size = 2) + coord_polar("y", start = 0) + theme(axis.title.x = element_blank(),axis.title.y = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), axis.text = element_blank(), legend.position = legend_pos, panel.background = element_blank(), plot.title = title_size) return(plot) }

Loop over variables

for(i in 1:ncol(df_factor)){ table <- freqtable(df_factor_order[,i]) title <- paste("Pie Chart of ", colnames(df_factor_order[i])) title_size <- element_text(hjust = 0.5, face = "bold") legend_pos <- "right" print(plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)) }

```{whites, eval=FALSE, echo = eval_catcol}
# Categorical variables 
## Descriptive graphics: Pie-Plots Summary

### Save variable plots in a list
plots <- list()
for (i in 1:ncol(df_factor)){
  title <- substr(colnames(df_factor_order[i]), 1, 19)
  title_size <- element_text(size = 30 / min(25, ceiling(sqrt(ncol(df_factor)))),
                             face = "bold", hjust = 0.5)
  legend_pos <- "none"
  table <- freqtable(df_factor_order[,i])
  plots[[i]] <- plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)
}


### Summary Plot
grid.arrange(grobs = plots, ncol = ceiling(sqrt(ncol(df_factor_order))))


Try the Statsomat package in your browser

Any scripts or data that you put into this service are public.

Statsomat documentation built on Nov. 17, 2021, 5:17 p.m.