```{whites, eval=FALSE, echo = eval_rows}
rowsums <- data.frame(sapply(df,is.na)) rows_drop <- (which(rowSums(rowsums) == ncol(df))) df <- df[-rows_drop, ,drop=FALSE]
```r eval_num <- FALSE eval_numcol <- FALSE if (exists("df_num")){ if (ncol(df_num)>0){ indices <- which(colnames(df_code) %in% colnames(df_num)) cat("\\# `Column names of selected continuous variables`") cat("\\newline ") cat("`colnames_continuous = ") cat(paste0("c(", paste(indices, collapse=','), ")`")) eval_num <- TRUE eval_numcol <- (ncol(df_num)>1) } }
```{whites, eval=FALSE, echo = eval_num}
df_num <- df[ ,colnames_continuous, drop=FALSE]
```r eval_cat <- FALSE eval_catcol <- FALSE if (exists("df_factor")){ if (ncol(df_factor)>0){ indices <- which(colnames(df_code) %in% colnames(df_factor)) cat("\\# `Column names of selected categorical variables`") cat("\\newline ") cat("`colnames_categorical = ") cat(paste0("c(", paste(indices, collapse=','), ")`")) eval_cat <- TRUE eval_catcol <- (ncol(df_factor)>1) } }
```{whites, eval=FALSE, echo = eval_cat}
df_factor <- df[ ,colnames_categorical, drop=FALSE]
```{whites, eval=FALSE, echo = eval_num} # Continuous variables ## Descriptive statistics ### Take over summary from psych package and add new stats stats_new <- psych::describe(df_num) ### Drop some stats which we do not need stats_new <- as.data.frame(stats_new) stats_new <- stats_new[c(-1,-6,-10,-13)] ### Add new stats stats_new$Variable <- colnames(df_num) stats_new$ntotal <- nrow(df_num) ### Missings stats_new$miss <- sapply(df_num, function(col) sum(is.na(col))) ### Complete rate stats_new$complete <- sapply(df_num, function(col) (1-(sum(is.na(col)) / nrow(df_num)))*100) ### N Unique stats_new$N_Unique <- sapply(df_num, function(col) length(unique(na.omit(col)))) ### CV stats_new$CV <- sapply(df_num, function(col) { ifelse(any(col <= 0, na.rm=TRUE), "-", round((sd(col, na.rm=TRUE) / mean(col, na.rm=TRUE)),2)) }) ### Reorder columns stats_new <- stats_new[,c(10,11,12,1,13,14,2:9,15)] ### Column names colnames(stats_new) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", "SD", "Median", "MAD", "MIN", "MAX", "Skewness", "Kurtosis", "CV") ### Order by variable name stats_new <- stats_new[order(stats_new$Variable),] ### Output knitr::kable(stats_new, digits=2, row.names = FALSE, format="simple")
```{whites, eval=FALSE, echo = eval_num}
df_num_order <- df_num[,order(colnames(df_num)),drop=FALSE]
single_hist <- function(x, main = "Histogram", ylab="Relative Frequency", xlab=NULL, freq=FALSE, bcol="#2fa42d", dcol=c("#396e9f","#396e9f"), dlty=c("dotted", "solid"), breaks=21) {
h <- hist(x, plot=FALSE, breaks=breaks) m <- mean(x, na.rm=TRUE) s <- sd(x, na.rm=TRUE) d <- density(x, na.rm=TRUE)
# Set nice x and y axis limits xlims <- pretty(c(floor(h$breaks[1]),ceiling(last(h$breaks)))) ymax <- max(h$density) dmax <- max(d$y) ymax <- max(ymax,dmax)
# Plots plot(h, freq=freq, ylim=c(0, ymax*1.2), ylab=ylab, xlab=xlab, main=main, col=bcol, xlim = c(min(xlims), max(xlims))) lines(d, lty=dlty[1], col=dcol[1]) curve(dnorm(x,m,s), add=TRUE, lty=dlty[2], col=dcol[2])
}
for (i in 1:ncol(df_num)){ single_hist(df_num_order[,i], main = paste("Histogram of ", colnames(df_num_order[i]))) }
```{whites, eval=FALSE, echo = eval_numcol} # Continuous variables ## Descriptive graphics: Histograms Summary k <- ceiling(ncol(df_num)/20)-1 for (i in 0:k){ m <- 20*i+1 n <- min(20*(i+1),ncol(df_num)) multi.hist(df_num_order[,m:n], dcol=c("#396e9f","#396e9f"), bcol= "#2fa42d", dlty=c("dotted", "solid"), main = colnames(df_num_order[,m:n])) }
```{whites, eval=FALSE, echo = eval_num}
for (i in 1:ncol(df_num)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = paste("Boxplot of",colnames(df_num_order[i])), xlab=paste(colnames(df_num_order[i])), horizontal = TRUE) }
```{whites, eval=FALSE, echo = eval_numcol} # Continuous variables ## Descriptive graphics: Box-Plots Summary ### Set graphical parameters par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) ### Loop over variables for(i in 1:ncol(df_num)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = colnames(df_num_order[i]), xlab=paste(colnames(df_num_order[i])), xaxt="n", horizontal = TRUE) } ### Restore original graphical settings par(opar)
```{whites, eval=FALSE, echo = eval_num}
for (i in 1:ncol(df_num)){
data <- as.data.frame(df_num_order[,c(i)]) colnames(data) <- "variable"
# Plot ECDF step_function <- ecdf(data$variable) plot(step_function, main=paste("ECDF Plot of", colnames(df_num_order[i])), xlab=colnames(df_num_order[i]), ylab="ECDF", cex=0.7, col="#2fa42d", do.points=TRUE)
# Plot CDF of normal distribution data_mean<- mean(data$variable, na.rm=TRUE) data_sd<- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from=qnorm(0.0001, mean=data_mean, sd=data_sd), to=qnorm(0.9999, mean=data_mean, sd=data_sd), add=TRUE, col="#396e9f", lwd=2) }
```{whites, eval=FALSE, echo = eval_numcol} # Continuous variables ## Graphics: ECDF Plots Summary ### ECDF function ecdf_plot <- function(i){ data <- as.data.frame(df_num_order[,c(i)]) colnames(data)<-"variable" # Plot ECDF step_function <- ecdf(data$variable) ecdf_plot <- plot(step_function, main = colnames(df_num_order[i]), xlab = colnames(df_num_order[i]), ylab = "ECDF", cex = 0.7, col="#2fa42d", do.points = FALSE) # Plot CDF of normal distribution data_mean <- mean(data$variable, na.rm=TRUE) data_sd <- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from = qnorm(0.0001, mean = data_mean, sd = data_sd), to = qnorm(0.9999, mean = data_mean, sd = data_sd), add = TRUE, col="#396e9f", lwd=0.5,pch=1) } ### Set graphical parameters par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) ### Loop over variables for(i in 1:ncol(df_num)) ecdf_plot(i) ### Restore original graphical settings par(opar)
```{whites, eval=FALSE, echo = eval_num}
qq_plot <- function(i, main, xlab, ylab){ var <- df_num_order[,i] qqplot(x = qnorm(ppoints(var), mean = mean(var, na.rm = TRUE), sd = sd(var, na.rm = TRUE)), y = var, xlim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), ylim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), main = main, xlab = xlab, ylab = ylab, col = "#2fa42d", cex=0.7, pch=19 ) abline(a = 0, b = 1, col = "#396e9f", lwd = 2) grid() }
for (i in 1:ncol(df_num)){ qq_plot(i, main = paste("QQ-Plot of", colnames(df_num_order[i])), xlab = "Theoretical Quantiles, Normal Distribution", ylab = paste("Sample Quantiles for ", colnames(df_num_order[i])) ) }
```{whites, eval=FALSE, echo = eval_numcol} # Continuous variables ## Graphics: QQ Plots Summary ### Set graphical parameters par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) ### Loop over variables for(i in 1:ncol(df_num)){ qq_plot(i, colnames(df_num_order[i]), "", "") } ### Restore original graphical settings par(opar)
```{whites, eval=FALSE, echo = eval_cat}
miss <- sapply(df_factor, function(col) sum(is.na(col))) complete <- sapply(df_factor, function(col) (1-(sum(is.na(col)) / nrow(df_factor)))*100) complete <- round(complete,3) totals <- data.frame(miss, complete) totals$Variable <- rownames(totals) totals$ntotal <- nrow(df_factor) totals$valid <- totals$ntotal - totals$miss totals$N_Unique <- sapply(df_factor, function(col) length(unique(col))) totals <- totals[,c(3,4,1,5,2,6)] totals <- totals[order(totals$Variable),] colnames(totals) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete","N Unique")
kable(totals, digits=2, row.names = FALSE, format="simple")
```{whites, eval=FALSE, echo = eval_cat} # Categorical variables ## Descriptive statistics: Frequencies ### Function stats per variable discrete <- function(i){ # Calculate individual statistics count <- table(df_factor[,i], useNA="always") perc <- as.data.frame(prop.table(count)) perc$Percent <- perc$Freq*100 perc$Freq <- NULL # Merge to one dataframe freq <- merge(count, perc, by="Var1") freq$Variable <- rep(colnames(df_factor)[i],nrow(freq)) freq <- freq[,c(4,1,2,3)] colnames(freq) <- c("Variable", "Category","Frequency", "Percent") # Rename missing category if(length(is.na(freq$Category))>0){ levels(freq$Category) <- c(levels(freq$Category),"Missing") freq$Category[is.na(freq$Category)] <- "Missing" } # Sort freq_order <- freq[order(-freq[,4],freq[,2]),] # Add category "All other values" in case of more than 20 categories min <- min(20, length(unique(df_factor[,i]))) if(min==20){ freq_order$Category <- as.character(freq_order$Category) freq_order <- rbind(freq_order[1:20,], c(colnames(df_factor)[i], as.character("****All Other Values****"), sum(freq_order$Frequency[-c(1:20)]), sum(freq_order$Percent[-c(1:20)]))) } else { freq_order <- freq_order[1:min,] } return(freq_order) } ### Loop over variables cat_table <- discrete(1) for (i in 1:ncol(df_factor)){ if (i>1){ cat_i <- discrete(i) cat_table <- rbind(cat_table, cat_i) } } ### Sort by variable name cat_table <- cat_table[order(cat_table$Variable),] cat_table$Percent <- round(as.numeric(cat_table$Percent),2) ### Output kable(cat_table, digits=2, row.names = FALSE, format="simple")
```{whites, eval=FALSE, echo = eval_cat}
df_factor_order <- df_factor[,order(colnames(df_factor)), drop=FALSE]
for (i in 1:ncol(df_factor)){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" counts <- counts[order(counts)]
# Plot by case (e.g. category names length) if (any(nchar(names(counts), type = "chars") >= 11) || length(counts) > 12){
if(length(counts) > 40){ # Bar-Plot with suppressed category names par(mar = c(6,6,4.1, 2.1), mgp = c(3, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), xaxt="n", ylab = "Frequency", cex.names = 0.6, las = 2, xlab = colnames(df_factor_order[i]), ylim = range(pretty(c(0,counts)))) } else { # Bar-Plot with shortened category names par(mar = c(8, 8, 4.1, 2.1), mgp = c(6, 1, 0)) names(counts) <- substr(names(counts), 1, 15) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.65, xlab= colnames(df_factor_order[i]), las=2, ylim=range(pretty(c(0,counts)))) }
} else { # Bar-Plot with full-length names par(mar = c(6,6, 4.1, 2.1), mgp = c(5, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.7, ylim=range(pretty(c(0,counts))), xlab= colnames(df_factor_order[i])) } }
```{whites, eval=FALSE, echo = eval_catcol} # Categorical variables ## Descriptive graphics: Bar-Plots Summary ### Function for Bar-Plot per variable plot_bar <- function(i){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" names(counts)[names(counts)=="NA"] <- "Missing" counts <- counts[order(counts)] barplot(counts, col = "#2fa42d", main = colnames(df_factor_order[i]), ylab = "Frequency", xaxt="n", ylim=range(pretty(c(0,counts)))) } ### Set graphical parameters par(mfrow=c(ceiling(sqrt(length(df_factor))), ceiling(sqrt(length(df_factor)))), mar=c(1.5,1,2,1), oma= c(1,1,1,1)) ### Loop over variables for(i in 1:ncol(df_factor)) plot_bar(i) ### Restore original graphical settings par(opar)
```{whites, eval=FALSE, echo = eval_cat}
freqtable <- function(col){
# Replace NA with "Missing" col[is.na(col)] <- "Missing"
# Create table with frequencies pie_table_unsorted <- as.data.frame(table(col)) pie_table_sorted <- pie_table_unsorted[order(pie_table_unsorted$Freq, decreasing=TRUE),] colnames(pie_table_sorted) <- c("Category", "Frequency")
# If more than 20 categories: summarize the smallest categories to one category if (nrow(pie_table_sorted)>20){ pie_table_sorted$Category <- as.character(pie_table_sorted$Category) pie_table_summarized <- rbind(pie_table_sorted[c(1:20),], c(as.character("All Other Values"), sum(pie_table_sorted$Frequency[-c(1:20)]))) pie_table_sorted <- pie_table_summarized } pie_table_sorted$RelFreq <- as.numeric(pie_table_sorted$Frequency) / length(col) return(pie_table_sorted) }
plot_pie <- function(table, title, title_size, legend_pos){
# Direction of the legend
if (max(nchar(as.character(table[,1])))>15){
legend = "vertical"
} else {
legend = "horizontal"
}
plot <-
ggplot(table, aes(x = "", y = RelFreq,
fill = reorder(Category, RelFreq))) +
guides(fill = guide_legend(title="", reverse = FALSE, direction = legend)) +
ggtitle(title) +
geom_col() +
geom_text(aes(label = scales::percent(RelFreq,accuracy = 0.01)),
position = position_stack(vjust = 0.5), size = 2) +
coord_polar("y", start = 0) +
theme(axis.title.x = element_blank(),axis.title.y = element_blank(),
axis.ticks = element_blank(), panel.grid = element_blank(),
axis.text = element_blank(), legend.position = legend_pos,
panel.background = element_blank(), plot.title = title_size)
return(plot)
}
for(i in 1:ncol(df_factor)){ table <- freqtable(df_factor_order[,i]) title <- paste("Pie Chart of ", colnames(df_factor_order[i])) title_size <- element_text(hjust = 0.5, face = "bold") legend_pos <- "right" print(plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)) }
```{whites, eval=FALSE, echo = eval_catcol} # Categorical variables ## Descriptive graphics: Pie-Plots Summary ### Save variable plots in a list plots <- list() for (i in 1:ncol(df_factor)){ title <- substr(colnames(df_factor_order[i]), 1, 19) title_size <- element_text(size = 30 / min(25, ceiling(sqrt(ncol(df_factor)))), face = "bold", hjust = 0.5) legend_pos <- "none" table <- freqtable(df_factor_order[,i]) plots[[i]] <- plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos) } ### Summary Plot grid.arrange(grobs = plots, ncol = ceiling(sqrt(ncol(df_factor_order))))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.