Nothing
knitr::opts_chunk$set(echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.width=8)
library(knitr) # kable library(psych) library(Hmisc) library(reshape2) library(ggplot2) library(gridExtra) # for grid.arrange library(PerformanceAnalytics) # for chart.ECDF library(car) # for qqPlot library(kableExtra) # Get data df <- params$data df_code <- df # Initialize further chunks eval0 <- FALSE eval <- FALSE tryCatch({ df <- df[,params$vars1,drop=FALSE] df2 <- df # Initialize next computations eval0 <- TRUE }, error=function(e) { stop(safeError("Please try other column names for the following columns: ")) } ) if (length(setdiff(params$vars1,colnames(df))) >0) { equal <- intersect(colnames(df),params$vars1) kable(setdiff(params$vars1,equal),col.names = "Column") }
# Call used libraries library(psych) library(Hmisc) library(reshape2) library(ggplot2) library(gridExtra) # for grid.arrange library(PerformanceAnalytics) # for chart.ECDF library(car) # for qqPlot library(kableExtra) # Initialize next computations eval <- FALSE eval_rows <- FALSE tryCatch({ # Drop columns if all observations are missing col_names_missing <- sapply(df, function(col) all(is.na(col))) df[ ,col_names_missing] <- list(NULL) df_list <- df # Drop empty rows rowsums <- data.frame(sapply(df,is.na)) if (length(which(rowSums(rowsums) == dim(df)[2])) != 0L){ eval_rows <- TRUE rows_drop <- (which(rowSums(rowsums) == dim(df)[2])) length_non_complete <- length(which(rowSums(rowsums) == dim(df)[2])) df <- df[-rows_drop, ,drop=FALSE] } # Convert logical variables to character cols_logical <- sapply(df, function(col) is.logical(col)) df[ ,cols_logical] <- sapply(df[ ,cols_logical], as.character) # Convert numerical variables with less than 7 unique values to character (missing values omitted) col_names_numeric <- sapply(df, function(col) length(unique(na.omit(col))) < 7L & is.numeric(col)) df[ ,col_names_numeric] <- sapply(df[ ,col_names_numeric], as.character) # Extract numerical variables df_num <- df[which(sapply(df, is.numeric) == 1L)] # Extract approximate continuous variables and non-continuous var if (ncol(df_num)>0){ rateunique_df <- sapply(df_num, function(col) continuous(col)) cols_continuous <- names(which(rateunique_df == TRUE)) df_cont <- df_num[,rateunique_df,drop=FALSE] # numeric, continuous resp. assumption fulfilled df_noncont <- df_num[,!rateunique_df,drop=FALSE] # numeric, non-continuous } # Extract character variables df_factor <- df[which(sapply(df, is.character) == 1L)] # Categorical if (exists("df_noncont")){ df_cat <- merge(df_factor, df_noncont, by="row.names") df_cat$Row.names <- NULL df_cat$Row.names.y <- NULL } else { df_cat <- df_factor } # Initialize next computations eval <- TRUE }, error=function(e) { stop(safeError("Dataset cannot be prepared. Please check the data for consistency.")) } )
# Chunk with first page of basic information cat("\n# Basic Information", fill=TRUE) cat("\\small ", fill=TRUE) cat("Automatic statistics for the file:", fill=TRUE) dataname <- params$filename[1] kable(dataname, col.names = "File", linesep = '', longtable=T) cat("Your selection for the encoding:", fill=TRUE) if (params$fencoding=="unknown"){ cat("Auto") } else {cat("UTF-8")} cat("\\newline",fill=TRUE) cat("Your selection for the decimal character:", fill=TRUE) if (params$decimal=="auto"){ cat("Auto") } else {cat(params$decimal)} cat("\\newline",fill=TRUE) cat("Observations (rows with at least one non-missing value): ", fill=TRUE) cat(dim(df)[1]) cat("\\newline",fill=TRUE) # Missing rows if (exists("length_non_complete")){ cat("Number of rows that are dropped because they contain no values (all values are missing):", length_non_complete) cat("\\newline",fill=TRUE) } cat("Variables (columns with at least one non-missing value): ", fill=TRUE) cat(dim(df_list)[2]) cat("\\newline",fill=TRUE) # Missing columns if (exists("col_names_missing")){ if (sum(col_names_missing) != 0L){ cat("Number of columns that are dropped because they contain no values (all values are missing):", sum(col_names_missing), fill=TRUE) cat("\\newline",fill=TRUE) } } if (exists("df_cont")){ cat("Variables considered continuous: ", fill=TRUE) if (ncol(df_cont)>0){ cat(ncol(df_cont),fill=TRUE) knitr::kable(cols_continuous, col.names = "Variables considered continuous", linesep = '', longtable=T) %>% kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position","repeat_header")) } else { cat("0", fill=TRUE) cat("\\newline",fill=TRUE) } } if (exists("df_cat")){ cat("Variables considered categorical: ", fill=TRUE) if (ncol(df_cat)>0){ cat(ncol(df_cat),fill=TRUE) knitr::kable(colnames(df_cat), col.names = "Variables considered categorical", linesep = '', longtable=T) %>% kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position","repeat_header")) } else { cat("0", fill=TRUE) cat("\\newline",fill=TRUE) } }
# Numeric falsly to char? check_reading <- function(col){ numeric <- !is.na(as.numeric(col)) return(sum(numeric)/sum(!is.na(col))) } df_char2 <- df2[which(sapply(df2, is.character) == 1L)] numeric_percent <- sapply(df_char2, function(col) check_reading(col)) if (length(numeric_percent[(numeric_percent>0.9)]) != 0L){ cat("**Warning: More than 90% of the values of these columns could be treated as numeric. Nevertheless, because of some values or the selected decimal character, the columns must be treated as discrete. Are all the values plausible? Please check the data once more before uploading! Column(s):**", names(numeric_percent[(numeric_percent>0.9)]), fill=TRUE) }
\pagebreak
# Instead of updating the rest of the code: if (exists("df_cont")) df_num <- df_cont if (exists("df_cat")) df_factor <- df_cat
# Title if (exists("df_num")){ if (dim(df_num)[2] != 0L){ cat("# Results for Numerical Variables", fill=TRUE) cat("## Descriptive Statistics", fill=TRUE) cat("Variables are sorted alphabetically. Missings are omitted in the stats. CV only for positive variables. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] != 0L){ # Continuous variables ## Descriptive statistics table ### Take over summary from psych package and add new stats stats_new <- psych::describe(df_num) ### Select variables to drop stats_new <- as.data.frame(stats_new) stats_new <- stats_new[c(-1,-6,-10,-13)] ### Add new variables stats_new$Variable <- colnames(df_num) stats_new$ntotal <- nrow(df_num) ### Missings stats_new$miss <- sapply(df_num, function(col) sum(is.na(col))) ### Complete rate stats_new$complete <- sapply(df_num, function(col) (1-(sum(is.na(col)) / nrow(df_num)))*100) ### N Unique stats_new$N_Unique <- sapply(df_num, function(col) length(unique(na.omit(col)))) ### CV stats_new$CV <- sapply(df_num, function(col) { ifelse(any(col <= 0, na.rm=TRUE), "-", round((sd(col, na.rm=TRUE) / mean(col, na.rm=TRUE)),2)) }) ### Reorder columns stats_new <- stats_new[,c(10,11,12,1,13,14,2:9,15)] ### Column names colnames(stats_new) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", "SD", "Median", "MAD", "MIN", "MAX", "Skewness", "Kurtosis", "CV") # Order by variable name stats_new <- stats_new[order(stats_new$Variable),] ### Column size csize <- min(max(nchar(stats_new$Variable)),25) stats_new$Variable <- substr(stats_new$Variable,1,csize) ### Output table knitr::kable(stats_new, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") %>% kable_styling(font_size = 8, latex_options = c("scale_down", "repeat_header")) }}}, error=function(e) message(e) )
\pagebreak
if (exists("df_num")){ if (dim(df_num)[2] >1){ cat("## Graphics", fill=TRUE) cat("### Histograms", fill=TRUE) if(dim(df_num)[2]>1){ cat("One Relative Frequency Histogram per page for each variable. Variables are sorted alphabetically. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation. ", fill=TRUE) } } }
if (exists("df_num")){ if (dim(df_num)[2] == 1){ cat("## Graphics", fill=TRUE) cat("### Histograms", fill=TRUE) if(dim(df_num)[2]==1){ cat("Relative Frequency Histogram. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation. ", fill=TRUE) } } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] != 0L){ # Numerical variables # Graphics: Histograms Large # For each variable df_num_order <- df_num[,order(colnames(df_num)),drop=FALSE] # Function to plot histogram for each variable single_hist <- function(x, main = "Histogram", ylab="Relative Frequency", xlab=NULL, freq=FALSE, bcol="#2fa42d", dcol=c("#396e9f","#396e9f"), dlty=c("dotted", "solid"), breaks=21) { h <- hist(x, plot=FALSE, breaks=breaks) m <- mean(x, na.rm=TRUE) s <- sd(x, na.rm=TRUE) d <- density(x, na.rm=TRUE) ## Set nice x and y axis limits xlims <- pretty(c(floor(h$breaks[1]),ceiling(last(h$breaks)))) ymax <- max(h$density) dmax <- max(d$y) ymax <- max(ymax,dmax) ## Plots plot(h, freq=freq, ylim=c(0, ymax*1.2), ylab=ylab, xlab=xlab, main=main, col=bcol, xlim = c(min(xlims), max(xlims))) lines(d, lty=dlty[1], col=dcol[1]) curve(dnorm(x,m,s), add=TRUE, lty=dlty[2], col=dcol[2]) } # Loop over variables for (i in 1:ncol(df_num_order)){ single_hist(df_num_order[,i], main = paste("Histogram of ", colnames(df_num_order[i]))) cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
# Title if (exists("df_num")){ if (dim(df_num)[2] >1){ cat("### Histograms Summary", fill = TRUE) cat("Multiple Relative Frequency Histogram in one figure. Variables are sorted alphabetically. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] > 1){ # Numerical variables # Graphics: Histogram Summary # All par(mar=c(1.5,1,2,1.5), oma=c(1,1,1,1)) k <- ceiling(dim(df_num)[2]/20)-1 for (i in 0:k){ m <- 20*i+1 n <- min(20*(i+1),dim(df_num)[2]) multi.hist(df_num_order[,m:n], dcol=c("#396e9f","#396e9f"), bcol= "#2fa42d", dlty=c("dotted", "solid"), main = colnames(df_num_order[,m:n])) } }}}, error=function(e) message(e))
\pagebreak
if (exists("df_num")){ if (dim(df_num)[2] != 0L){ cat("### Box-Plots", fill=TRUE) if(dim(df_num)[2]>1){ cat("One Box-Plot per page for each variable. Variables are sorted alphabetically. ", fill=TRUE) } } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] != 0L){ # Numerical variables # Graphics: Boxplot Large # For each variable for (i in 1:ncol(df_num_order)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = paste("Boxplot of",colnames(df_num_order[i])), xlab=paste(colnames(df_num_order[i])), horizontal = TRUE) cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
\pagebreak
# Title if (exists("df_num")){ if (dim(df_num)[2] >1){ cat("### Box-Plots Summary", fill=TRUE) cat("Multiple Box-Plots of variables in one figure. Variables are sorted alphabetically. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] >1){ # Numerical variables # Graphics: Box-Plots Summary # For each variable # Set graphical parameters if(ncol(df_num_order)>25){ par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) } else { par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma= c(1,1,1,1)) } for(i in 1:ncol(df_num_order)){ boxplot(df_num_order[,c(i)], col = "#2fa42d", main = colnames(df_num_order[i]), xlab = paste(colnames(df_num_order[i])), xaxt = "n", horizontal = TRUE) } }}}, error=function(e) message(e) )
\pagebreak
if (exists("df_num")){ if (dim(df_num)[2]==1){ cat("### ECDF Plots", fill=TRUE) cat("ECDF (Empirical Cumulative Distribution Function) Plot. The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF. ", fill=TRUE) } else if (dim(df_num)[2]>1){ cat("### ECDF Plots", fill=TRUE) cat(" One ECDF (Empirical Cumulative Distribution Function) Plot per page for each variable. Variables are sorted alphabetically. The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] > 0L){ # Numerical variables # Graphics: ECDF Plots Large # For each variable for (i in 1:length(df_num_order)){ data <- as.data.frame(df_num_order[,c(i)]) colnames(data) <- "variable" # Plot ECDF step_function <- ecdf(data$variable) plot(step_function, main=paste("ECDF Plot of", colnames(df_num_order[i])), xlab=colnames(df_num_order[i]), ylab="ECDF", cex=0.7, col="#2fa42d", do.points=TRUE) # Plot CDF of normal distribution data_mean<- mean(data$variable, na.rm=TRUE) data_sd<- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from=qnorm(0.0001, mean=data_mean, sd=data_sd), to=qnorm(0.9999, mean=data_mean, sd=data_sd), add=TRUE, col="#396e9f", lwd=2) cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
# Title if (exists("df_num")){ if (dim(df_num)[2] > 1){ cat("### ECDF Plots Summary", fill=TRUE) cat("Multiple ECDF Plots of variables in one figure. Variables are sorted alphabetically. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] >1){ # Numerical variables # Graphics: ECDF Plots Summary ecdf_plot <- function(i){ data <-as.data.frame(df_num_order[,c(i)]) colnames(data)<-"variable" # Plot ECDF step_function <- ecdf(data$variable) ecdf_plot <- plot(step_function, main = colnames(df_num_order[i]), xlab = colnames(df_num_order[i]), ylab = "ECDF", cex = 0.7, col="#2fa42d", do.points = FALSE) # Plot CDF of normal distribution data_mean <- mean(data$variable, na.rm=TRUE) data_sd <- sd(data$variable, na.rm=TRUE) curve(pnorm(x, data_mean,data_sd), from = qnorm(0.0001, mean = data_mean, sd = data_sd), to = qnorm(0.9999, mean = data_mean, sd = data_sd), add = TRUE, col="#396e9f", lwd=0.5,pch=1) } # Set graphical parameters if(ncol(df_num_order)>25){ par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) } else { par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma= c(1,1,1,1)) } for(i in 1:ncol(df_num_order)) ecdf_plot(i) }}}, error=function(e) message(e) )
\pagebreak
if (exists("df_num")){ if (dim(df_num)[2]==1){ cat("### QQ-Plots", fill=TRUE) } else if (dim(df_num)[2]>1){ cat("### QQ-Plots", fill=TRUE) cat("One QQ-Plot per page for each variable. Variables are sorted alphabetically. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] != 0L){ # Define function for the QQ-Plot qq_plot <- function(i, main, xlab, ylab){ var <- df_num_order[,i] qqplot(x = qnorm(ppoints(var), mean = mean(var, na.rm = TRUE), sd = sd(var, na.rm = TRUE)), y = var, xlim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), ylim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), main = main, xlab = xlab, ylab = ylab, col = "#2fa42d", cex=0.7, pch=19 ) abline(a = 0, b = 1, col = "#396e9f", lwd = 2) grid() } # Numerical variables # Graphics: QQ-Plot Large for (i in 1:ncol(df_num)){ qq_plot(i, main = paste("QQ-Plot of", colnames(df_num_order[i])), xlab = "Theoretical Quantiles, Normal Distribution", ylab = paste("Sample Quantiles for ", colnames(df_num_order[i])) ) cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
# Title if (exists("df_num")){ if (dim(df_num)[2] >1){ cat("### QQ-Plots Summary", fill=TRUE) cat("QQ-Plots of variables in one figure. Theoretical Quantiles of the Normal Distribution. ", fill=TRUE) } }
tryCatch({ if (exists("df_num")){ if (dim(df_num)[2] >1){ # Numerical variables # Graphics: QQ-Plots Summary # Set graphical parameters if(ncol(df_num_order)>25){ par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) } else { par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), mar=c(1.5,1,2,1), oma= c(1,1,1,1)) } for(i in 1:ncol(df_num)) qq_plot(i, colnames(df_num_order[i]), "", "") }}}, error=function(e) message(e) )
\pagebreak
# Title if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ cat("# Results for Discrete Variables", fill=TRUE) cat("## Descriptive Statistics", fill=TRUE) cat("### Totals", fill = TRUE) cat("The table is sorted by the variable name. If any, N Unique contains the missing category.", fill=TRUE) } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ # Discrete variables # Descriptive statistics ### Totals table miss <- sapply(df_factor, function(col) sum(is.na(col))) complete <- sapply(df_factor, function(col) (1-(sum(is.na(col)) / nrow(df_factor)))*100) complete <- round(complete,3) totals <- data.frame(miss, complete) totals$Variable <- rownames(totals) totals$ntotal <- nrow(df_factor) totals$valid <- totals$ntotal - totals$miss totals$N_Unique <- sapply(df_factor, function(col) length(unique(col))) totals <- totals[,c(3,4,1,5,2,6)] totals <- totals[order(totals$Variable),] colnames(totals) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete","N Unique") # Column size csize <- min(max(nchar(totals$Variable)),25) totals$Variable <- substr(totals$Variable,1,csize) ## Output table kable(totals, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") }}}, error=function(e) message(e) )
\pagebreak
# Title if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ cat("### Frequencies", fill = TRUE) cat("The table is sorted by the variable name. For each variable, a maximum of 20 unique values are considered, sorted in decreasing order of their frequency. If any, missings are counted as a category. ", fill=TRUE) } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ # Discrete variables # Descriptive statistics # Frequencies discrete <- function(i){ # Calculate individual statistics count <- table(df_factor[,i], useNA="always") perc <- as.data.frame(prop.table(count)) perc$Percent <- perc$Freq*100 perc$Freq <- NULL # Merge to one dataframe freq <- merge(count, perc, by="Var1") freq$Variable <- rep(colnames(df_factor)[i],nrow(freq)) freq <- freq[,c(4,1,2,3)] colnames(freq) <- c("Variable", "Category","Frequency", "Percent") # Rename missing category if(length(is.na(freq$Category))>0){ levels(freq$Category) <- c(levels(freq$Category),"Missing") freq$Category[is.na(freq$Category)] <- "Missing" } # Sort freq_order <- freq[order(-freq[,4],freq[,2]),] # Add category "All other values" in case of more than 20 categories min <- min(20, length(unique(df_factor[,i]))) if(min==20){ freq_order$Category <- as.character(freq_order$Category) freq_order <- rbind(freq_order[1:20,],c(colnames(df_factor)[i], as.character("****All Other Values****"), sum(freq_order$Frequency[-c(1:20)]), sum(freq_order$Percent[-c(1:20)]))) } else { freq_order <- freq_order[1:min,] } return(freq_order) } # Merge individual variable frequency tables to one table cat_table <- discrete(1) for (i in 1:dim(df_factor)[2]){ if (i>1){ cat_i <- discrete(i) cat_table <- rbind(cat_table, cat_i) } } cat_table <- cat_table[order(cat_table$Variable),] cat_table$Percent <- round(as.numeric(cat_table$Percent),2) # longest_category <- paste0(max(c(apply(as.data.frame(cat_table$Category), 1, nchar),8), na.rm=TRUE)*0.15,"cm") kable(cat_table, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") %>% # column_spec(2, width=longest_category) %>% kable_styling(font_size = 7, latex_options = c("scale_down", "repeat_header"), full_width = F) }}}, error=function(e) message(e) )
\pagebreak
# Title if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ cat("## Graphics", fill=TRUE) cat("### Bar-Plots", fill=TRUE) if(dim(df_factor)[2]>1){ cat("One Bar-Plot per page for each variable. Variables are sorted alphabetically. ", fill=TRUE) } } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ # Discrete variables # Graphics: Barplots Large df_factor_order <- df_factor[,order(colnames(df_factor)),drop=FALSE] for (i in 1:ncol(df_factor)){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" counts <- counts[order(counts)] if(any(nchar(names(counts), type = "chars") >= 11) || length(counts) > 12){ # Barplot with suppressed category names if too many if(length(counts) > 40){ par(mar = c(6,6,4.1, 2.1),mgp = c(3, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])),xaxt="n", ylab = "Frequency", cex.names = 0.6, las=2, xlab = colnames(df_factor_order[i]), ylim = range(pretty(c(0,counts)))) cat("\\newline", fill=TRUE) cat("More than 40 categories. Category labels are not displayed. ", fill=TRUE) } else { par(mar = c(8, 8, 4.1, 2.1), mgp = c(6, 1, 0)) # Barplot with shortened category names if names are too long names(counts) <- substr(names(counts), 1, 15) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.65, xlab = colnames(df_factor_order[i]), las = 2, ylim = range(pretty(c(0,counts)))) } } else { # Barplot with full-length names par(mar = c(6,6, 4.1, 2.1), mgp = c(5, 1, 0)) barplot(counts, col = "#2fa42d", main = paste("Barplot of ", colnames(df_factor_order[i])), ylab = "Frequency", cex.names = 0.7,ylim=range(pretty(c(0,counts))), xlab= colnames(df_factor_order[i])) } cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
\pagebreak
# Title if (exists("df_factor")){ if (dim(df_factor)[2] >1){ cat("### Bar-Plots Summary") cat("\n\n\ Multiple Bar-Plots of variables in one figure. Variables are sorted alphabetically. ", fill=TRUE) } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] >1){ # Numerical variables # Graphics: Barplots Summary # For each variable plot_bar <- function(i){ counts <- table(df_factor_order[i], useNA = "ifany") names(counts)[is.na(names(counts))] <- "Missing" names(counts)[names(counts)=="NA"] <- "Missing" counts <- counts[order(counts)] barplot(counts, col = "#2fa42d", main = colnames(df_factor_order[i]), ylab = "Frequency", xaxt="n", ylim=range(pretty(c(0,counts)))) } # Set graphical parameters if(ncol(df_factor)>25){ par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1)) } else { par(mfrow=c(ceiling(sqrt(length(df_factor))), ceiling(sqrt(length(df_factor)))), mar=c(1.5,1,2,1), oma= c(1,1,1,1)) } for(i in 1:ncol(df_factor_order)) plot_bar(i) }}}, error=function(e) message(e) )
\pagebreak
if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ cat("### Pie Plots", fill=TRUE) if(dim(df_factor)[2]>1){ cat("One Pie Plot per page for each variable. Variables are sorted alphabetically. ", fill=TRUE) } } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] != 0L){ # Function to create Frequency Table for Each Variable freqtable <- function(col){ # Replace NA with "Missing" col[is.na(col)] <- "Missing" # Create table with Frequencies pie_table_unsorted <- as.data.frame(table(col)) pie_table_sorted <- pie_table_unsorted[order(pie_table_unsorted$Freq, decreasing=TRUE),] colnames(pie_table_sorted) <- c("Category", "Frequency") # If more than 20 categories: summarize the smallest categories to one category if (nrow(pie_table_sorted)>20){ pie_table_sorted$Category <- as.character(pie_table_sorted$Category) pie_table_summarized <- rbind(pie_table_sorted[c(1:20),], c(as.character("All Other Values"), sum(pie_table_sorted$Frequency[-c(1:20)]))) pie_table_sorted <- pie_table_summarized } pie_table_sorted$RelFreq <- as.numeric(pie_table_sorted$Frequency) / length(col) return(pie_table_sorted) } ## Plot-Function plot_pie <- function(table, title, title_size, legend_pos){ # Direction of the legend for large category names if (max(nchar(as.character(table[,1])))>15){ legend = "vertical" } else { legend = "horizontal" } plot <- ggplot(table, aes(x = "", y = RelFreq, fill = reorder(Category, RelFreq))) + guides(fill = guide_legend(title="", reverse = FALSE, direction = legend)) + ggtitle(title) + geom_col() + geom_text(aes(label = scales::percent(RelFreq,accuracy = 0.01)), position = position_stack(vjust = 0.5), size = 2) + coord_polar("y", start = 0) + theme(axis.title.x = element_blank(),axis.title.y = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), axis.text = element_blank(), legend.position = legend_pos, panel.background = element_blank(), plot.title = title_size) return(plot) } for(i in 1:ncol(df_factor_order)){ table <- freqtable(df_factor_order[,i]) title <- paste("Pie Chart of ", colnames(df_factor_order[i])) title_size <- element_text(hjust = 0.5, face = "bold") legend_pos <- "right" print(plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)) cat("\n\n\\pagebreak\n") } }}}, error=function(e) message(e) )
# Title if (exists("df_factor")){ if (dim(df_factor)[2] >1){ cat("### Pie Plots Summary", fill=TRUE) cat("Multiple Pie Plots of variables in one figure. Variables are sorted alphabetically. ", fill=TRUE) } }
tryCatch({ if (exists("df_factor")){ if (dim(df_factor)[2] >1){ plots <- list() for (i in 1:ncol(df_factor_order)){ title <- substr(colnames(df_factor_order[i]), 1, 19) title_size <- element_text(size = 30 / min(25, ceiling(sqrt(ncol(df_factor_order)))), face = "bold", hjust = 0.5) legend_pos <- "none" table <- freqtable(df_factor_order[,i]) plots[[i]] <- plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos) } if (ncol(df_factor_order)>25){ for(i in 1:ceiling(ncol(df_factor_order)/25)){ index <- seq((i-1)*25+1,min(i*25,ncol(df_factor_order)), by=1) plots_2 <- plots[index] grid.arrange(grobs=plots_2, ncol=5) } } else { grid.arrange(grobs = plots, ncol = ceiling(sqrt(ncol(df_factor_order)))) } }}}, error=function(e) message(e) )
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.