Exploratory Data Analysis (EDA)"

knitr::opts_chunk$set(echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.width=8)
library(knitr) # kable
library(psych)
library(Hmisc)
library(reshape2)
library(ggplot2)
library(gridExtra) # for grid.arrange
library(PerformanceAnalytics) # for chart.ECDF
library(car) # for qqPlot
library(kableExtra)

# Get data
df <- params$data
df_code <- df

# Initialize further chunks
eval0 <- FALSE
eval <- FALSE

tryCatch({

  df <- df[,params$vars1,drop=FALSE]
  df2 <- df

  # Initialize next computations
  eval0 <- TRUE

}, error=function(e) {

  stop(safeError("Please try other column names for the following columns: "))
}

)

if (length(setdiff(params$vars1,colnames(df))) >0) {
  equal <- intersect(colnames(df),params$vars1)
  kable(setdiff(params$vars1,equal),col.names = "Column")
}
# Call used libraries 
library(psych)
library(Hmisc)
library(reshape2)
library(ggplot2)
library(gridExtra) # for grid.arrange
library(PerformanceAnalytics) # for chart.ECDF
library(car) # for qqPlot
library(kableExtra)

# Initialize next computations
eval <- FALSE
eval_rows <- FALSE

tryCatch({

# Drop columns if all observations are missing 
col_names_missing <- sapply(df, function(col) all(is.na(col)))
df[ ,col_names_missing] <- list(NULL)
df_list <- df 

# Drop empty rows
rowsums <- data.frame(sapply(df,is.na))
if (length(which(rowSums(rowsums) == dim(df)[2])) != 0L){
  eval_rows <- TRUE
  rows_drop <- (which(rowSums(rowsums) == dim(df)[2]))
  length_non_complete <- length(which(rowSums(rowsums) == dim(df)[2]))
  df <- df[-rows_drop, ,drop=FALSE]
}

# Convert logical variables to character
cols_logical <- sapply(df, function(col) is.logical(col))
df[ ,cols_logical] <- sapply(df[ ,cols_logical], as.character)

# Convert numerical variables with less than 7 unique values to character (missing values omitted)
col_names_numeric <- sapply(df, function(col) length(unique(na.omit(col))) < 7L & is.numeric(col))
df[ ,col_names_numeric] <- sapply(df[ ,col_names_numeric], as.character)

# Extract numerical variables 
df_num <- df[which(sapply(df, is.numeric) == 1L)]

# Extract approximate continuous variables and non-continuous var
if (ncol(df_num)>0){

  rateunique_df <- sapply(df_num, function(col) continuous(col))
  cols_continuous <- names(which(rateunique_df == TRUE))
  df_cont <- df_num[,rateunique_df,drop=FALSE] # numeric, continuous resp. assumption fulfilled 
  df_noncont <- df_num[,!rateunique_df,drop=FALSE] # numeric, non-continuous 

} 

# Extract character variables 
df_factor <- df[which(sapply(df, is.character) == 1L)]

# Categorical 
if (exists("df_noncont")){
  df_cat <- merge(df_factor, df_noncont, by="row.names")
  df_cat$Row.names <- NULL
  df_cat$Row.names.y <- NULL
} else {
  df_cat <- df_factor
}

# Initialize next computations
eval <- TRUE

}, error=function(e) {

  stop(safeError("Dataset cannot be prepared. Please check the data for consistency."))

}

)
# Chunk with first page of basic information

cat("\n# Basic Information", fill=TRUE)
cat("\\small ", fill=TRUE)
cat("Automatic statistics for the file:", fill=TRUE)
dataname <- params$filename[1]
kable(dataname, col.names = "File", linesep = '', longtable=T) 

cat("Your selection for the encoding:", fill=TRUE)
if (params$fencoding=="unknown"){
  cat("Auto")
} else {cat("UTF-8")}
cat("\\newline",fill=TRUE) 

cat("Your selection for the decimal character:", fill=TRUE)
if (params$decimal=="auto"){
  cat("Auto")
} else {cat(params$decimal)}
cat("\\newline",fill=TRUE) 

cat("Observations (rows with at least one non-missing value): ", fill=TRUE)
cat(dim(df)[1])
cat("\\newline",fill=TRUE) 

# Missing rows
if (exists("length_non_complete")){
  cat("Number of rows that are dropped because they contain no values (all values are missing):", length_non_complete)
  cat("\\newline",fill=TRUE) 
}

cat("Variables (columns with at least one non-missing value): ", fill=TRUE)
cat(dim(df_list)[2])
cat("\\newline",fill=TRUE) 


# Missing columns
if (exists("col_names_missing")){
  if (sum(col_names_missing) != 0L){
    cat("Number of columns that are dropped because they contain no values (all values are missing):", sum(col_names_missing), fill=TRUE)
    cat("\\newline",fill=TRUE) 
  } 
}


if (exists("df_cont")){
  cat("Variables considered continuous: ", fill=TRUE)
  if (ncol(df_cont)>0){
    cat(ncol(df_cont),fill=TRUE)
    knitr::kable(cols_continuous, col.names = "Variables considered continuous", linesep = '', longtable=T) %>%
      kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position","repeat_header"))
  } else {
    cat("0", fill=TRUE)
    cat("\\newline",fill=TRUE) 
  }
}


if (exists("df_cat")){
  cat("Variables considered categorical: ", fill=TRUE)
  if (ncol(df_cat)>0){
    cat(ncol(df_cat),fill=TRUE)
    knitr::kable(colnames(df_cat), col.names = "Variables considered categorical", linesep = '', longtable=T) %>%
      kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position","repeat_header"))
  } else {
    cat("0", fill=TRUE)
    cat("\\newline",fill=TRUE) 
  }
}
# Numeric falsly to char? 
check_reading <- function(col){
  numeric <- !is.na(as.numeric(col))
  return(sum(numeric)/sum(!is.na(col)))
}

df_char2 <- df2[which(sapply(df2, is.character) == 1L)]
numeric_percent <- sapply(df_char2, function(col) check_reading(col))

if (length(numeric_percent[(numeric_percent>0.9)]) != 0L){
  cat("**Warning: More than 90% of the values of these columns could be treated as numeric. Nevertheless, because of some values or the selected decimal character, the columns must be treated as discrete. Are all the values plausible? Please check the data once more before uploading! Column(s):**", names(numeric_percent[(numeric_percent>0.9)]), fill=TRUE)
}

\pagebreak

# Instead of updating the rest of the code:
if (exists("df_cont")) df_num <- df_cont
if (exists("df_cat")) df_factor <- df_cat
# Title 
if (exists("df_num")){
  if (dim(df_num)[2] != 0L){
    cat("# Results for Numerical Variables", fill=TRUE)
    cat("## Descriptive Statistics", fill=TRUE)
    cat("Variables are sorted alphabetically. Missings are omitted in the stats. CV only for positive variables. ", fill=TRUE)

  } 
}
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] != 0L){

  # Continuous variables
  ## Descriptive statistics table

  ### Take over summary from psych package and add new stats 
  stats_new <- psych::describe(df_num)

  ### Select variables to drop 
  stats_new <- as.data.frame(stats_new)
  stats_new <- stats_new[c(-1,-6,-10,-13)]

  ### Add new variables
  stats_new$Variable <- colnames(df_num)
  stats_new$ntotal <- nrow(df_num)

  ### Missings
  stats_new$miss <- sapply(df_num, function(col) sum(is.na(col)))

  ### Complete rate
  stats_new$complete <- sapply(df_num, function(col) (1-(sum(is.na(col)) / nrow(df_num)))*100)

  ### N Unique
  stats_new$N_Unique <- sapply(df_num, function(col) length(unique(na.omit(col))))

  ### CV
  stats_new$CV <- sapply(df_num, function(col) {
     ifelse(any(col <= 0, na.rm=TRUE), "-", round((sd(col, na.rm=TRUE) / mean(col, na.rm=TRUE)),2))
    })

  ### Reorder columns 
  stats_new <- stats_new[,c(10,11,12,1,13,14,2:9,15)]

  ### Column names 
  colnames(stats_new) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", "SD", "Median", "MAD", "MIN", "MAX",
                        "Skewness", "Kurtosis", "CV")

  # Order by variable name
  stats_new <- stats_new[order(stats_new$Variable),]

  ### Column size
  csize <- min(max(nchar(stats_new$Variable)),25)
  stats_new$Variable <- substr(stats_new$Variable,1,csize)

  ### Output table 
  knitr::kable(stats_new, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") %>%
    kable_styling(font_size = 8,   latex_options = c("scale_down", "repeat_header")) 

}}}, error=function(e) message(e)
)

\pagebreak

if (exists("df_num")){
  if (dim(df_num)[2] >1){
    cat("## Graphics", fill=TRUE)
    cat("### Histograms", fill=TRUE)
    if(dim(df_num)[2]>1){
      cat("One Relative Frequency Histogram per page for each variable. Variables are sorted alphabetically. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation.  ", fill=TRUE)
    }


  } 
}
if (exists("df_num")){
  if (dim(df_num)[2] == 1){
    cat("## Graphics", fill=TRUE)
    cat("### Histograms", fill=TRUE)
    if(dim(df_num)[2]==1){
      cat("Relative Frequency Histogram. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation.  ", fill=TRUE)
    }
  } 
}
tryCatch({

if (exists("df_num")){
  if (dim(df_num)[2] != 0L){

    # Numerical variables
    # Graphics: Histograms Large
    # For each variable
    df_num_order <- df_num[,order(colnames(df_num)),drop=FALSE] 

    # Function to plot histogram for each variable 
    single_hist <- function(x, main = "Histogram", 
                        ylab="Relative Frequency", xlab=NULL, freq=FALSE, bcol="#2fa42d",
                        dcol=c("#396e9f","#396e9f"), dlty=c("dotted", "solid"), 
                        breaks=21) {

      h <-  hist(x, plot=FALSE, breaks=breaks)
      m <- mean(x, na.rm=TRUE)
      s <- sd(x, na.rm=TRUE)
      d <- density(x, na.rm=TRUE)

      ## Set nice x and y axis limits 
      xlims <- pretty(c(floor(h$breaks[1]),ceiling(last(h$breaks))))
      ymax <- max(h$density)
      dmax <- max(d$y)
      ymax <- max(ymax,dmax)

      ## Plots 
      plot(h, freq=freq, ylim=c(0, ymax*1.2), ylab=ylab, xlab=xlab, 
           main=main, col=bcol,  xlim = c(min(xlims), max(xlims)))
      lines(d, lty=dlty[1], col=dcol[1])
      curve(dnorm(x,m,s), add=TRUE, lty=dlty[2], col=dcol[2])

    }


    # Loop over variables
    for (i in 1:ncol(df_num_order)){

      single_hist(df_num_order[,i], main = paste("Histogram of ", colnames(df_num_order[i])))
      cat("\n\n\\pagebreak\n")

    }

}}}, error=function(e) message(e)
)
# Title
if (exists("df_num")){
  if (dim(df_num)[2] >1){
  cat("### Histograms Summary", fill = TRUE)
  cat("Multiple Relative Frequency Histogram in one figure. Variables are sorted alphabetically. The blue line represents the normal density approximation. The blue dotted line represents a special kernel density approximation.  ", fill=TRUE)
  } 
}
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] > 1){

  # Numerical variables
  # Graphics: Histogram Summary
  # All
  par(mar=c(1.5,1,2,1.5), oma=c(1,1,1,1))
  k <- ceiling(dim(df_num)[2]/20)-1
  for (i in 0:k){
    m <- 20*i+1
    n <- min(20*(i+1),dim(df_num)[2])
    multi.hist(df_num_order[,m:n], dcol=c("#396e9f","#396e9f"), 
               bcol= "#2fa42d", 
               dlty=c("dotted", "solid"), 
               main = colnames(df_num_order[,m:n])) 
  }
}}}, error=function(e) message(e))

\pagebreak

if (exists("df_num")){
  if (dim(df_num)[2] != 0L){
    cat("### Box-Plots", fill=TRUE)
    if(dim(df_num)[2]>1){
      cat("One Box-Plot per page for each variable. Variables are sorted alphabetically.  ", fill=TRUE)
    }
  } 
}
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] != 0L){

  # Numerical variables
  # Graphics: Boxplot Large 
  # For each variable
  for (i in 1:ncol(df_num_order)){
    boxplot(df_num_order[,c(i)], col = "#2fa42d", 
         main = paste("Boxplot of",colnames(df_num_order[i])),
         xlab=paste(colnames(df_num_order[i])), horizontal = TRUE)
    cat("\n\n\\pagebreak\n")
  }

}}}, error=function(e) message(e)
)

\pagebreak

# Title
if (exists("df_num")){
  if (dim(df_num)[2] >1){
  cat("### Box-Plots Summary", fill=TRUE)
  cat("Multiple Box-Plots of variables in one figure. Variables are sorted alphabetically.  ", fill=TRUE)
  } 
} 
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] >1){

  # Numerical variables
  # Graphics: Box-Plots Summary
  # For each variable

  # Set graphical parameters 
  if(ncol(df_num_order)>25){
     par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1))
  } else { 
    par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
        mar=c(1.5,1,2,1), oma= c(1,1,1,1))
    }

   for(i in 1:ncol(df_num_order)){
      boxplot(df_num_order[,c(i)], col = "#2fa42d", main = colnames(df_num_order[i]),
         xlab = paste(colnames(df_num_order[i])), xaxt = "n", horizontal = TRUE)
   }

}}}, error=function(e) message(e)
)

\pagebreak

if (exists("df_num")){
  if (dim(df_num)[2]==1){
    cat("### ECDF Plots", fill=TRUE)
    cat("ECDF (Empirical Cumulative Distribution Function) Plot.  The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF.  ", fill=TRUE)
  } else if (dim(df_num)[2]>1){
    cat("### ECDF Plots", fill=TRUE)
    cat(" One ECDF (Empirical Cumulative Distribution Function) Plot per page for each variable. Variables are sorted alphabetically. The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF.  ", fill=TRUE)
  }
}
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] > 0L){

  # Numerical variables
  # Graphics: ECDF Plots Large
  # For each variable
  for (i in 1:length(df_num_order)){

    data <- as.data.frame(df_num_order[,c(i)])
    colnames(data) <- "variable"

    # Plot ECDF
    step_function <- ecdf(data$variable)
    plot(step_function,
        main=paste("ECDF Plot of", colnames(df_num_order[i])),
        xlab=colnames(df_num_order[i]), ylab="ECDF",
        cex=0.7, col="#2fa42d", do.points=TRUE)

    # Plot CDF of normal distribution 
    data_mean<- mean(data$variable, na.rm=TRUE)
    data_sd<- sd(data$variable, na.rm=TRUE)
    curve(pnorm(x, data_mean,data_sd),
          from=qnorm(0.0001, mean=data_mean, sd=data_sd), 
          to=qnorm(0.9999, mean=data_mean, sd=data_sd), 
          add=TRUE, col="#396e9f", lwd=2)

    cat("\n\n\\pagebreak\n")
  }

}}}, error=function(e) message(e)
)
# Title
if (exists("df_num")){
  if (dim(df_num)[2] > 1){
  cat("### ECDF Plots Summary", fill=TRUE)
  cat("Multiple ECDF Plots of variables in one figure. Variables are sorted alphabetically.  ", fill=TRUE)
  } 
} 
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] >1){

  # Numerical variables
  # Graphics: ECDF Plots Summary
  ecdf_plot <- function(i){

    data <-as.data.frame(df_num_order[,c(i)])
    colnames(data)<-"variable"

    # Plot ECDF
    step_function <- ecdf(data$variable)
    ecdf_plot <- plot(step_function,
                    main = colnames(df_num_order[i]),
                    xlab = colnames(df_num_order[i]), ylab = "ECDF",
                    cex = 0.7, col="#2fa42d", do.points = FALSE)

    # Plot CDF of normal distribution
    data_mean <- mean(data$variable, na.rm=TRUE)
    data_sd <- sd(data$variable, na.rm=TRUE)
    curve(pnorm(x, data_mean,data_sd),
          from = qnorm(0.0001, mean = data_mean, sd = data_sd), 
          to = qnorm(0.9999, mean = data_mean, sd = data_sd), 
          add = TRUE, col="#396e9f", lwd=0.5,pch=1)
  }

  # Set graphical parameters 
  if(ncol(df_num_order)>25){
     par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1))
  } else { 
    par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
        mar=c(1.5,1,2,1), oma= c(1,1,1,1))
  }

  for(i in 1:ncol(df_num_order)) ecdf_plot(i)

}}},  error=function(e) message(e)
)

\pagebreak

if (exists("df_num")){
  if (dim(df_num)[2]==1){
    cat("### QQ-Plots", fill=TRUE)
  } else if (dim(df_num)[2]>1){
    cat("### QQ-Plots", fill=TRUE)
    cat("One QQ-Plot per page for each variable. Variables are sorted alphabetically.  ", fill=TRUE)
  }
}
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] != 0L){

  # Define function for the QQ-Plot  
  qq_plot <- function(i, main, xlab, ylab){
    var <- df_num_order[,i]
    qqplot(x = qnorm(ppoints(var), mean = mean(var, na.rm = TRUE), 
                     sd = sd(var, na.rm = TRUE)),
         y = var,
         xlim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), 
         ylim = c(min(var, na.rm = TRUE), max(var, na.rm = TRUE)), 
         main = main,
         xlab = xlab,
         ylab = ylab,
         col = "#2fa42d", cex=0.7, pch=19
         )
    abline(a = 0, b = 1, col = "#396e9f", lwd = 2)
    grid()
  }

  # Numerical variables
  # Graphics: QQ-Plot Large 
  for (i in 1:ncol(df_num)){
    qq_plot(i, main = paste("QQ-Plot of", colnames(df_num_order[i])),
            xlab = "Theoretical Quantiles, Normal Distribution", 
            ylab = paste("Sample Quantiles for ", colnames(df_num_order[i]))
            )
    cat("\n\n\\pagebreak\n")
  }

}}}, error=function(e) message(e)
)
# Title
if (exists("df_num")){
  if (dim(df_num)[2] >1){
  cat("### QQ-Plots Summary", fill=TRUE)
  cat("QQ-Plots of variables in one figure. Theoretical Quantiles of the Normal Distribution.
  ", fill=TRUE)
  } 
} 
tryCatch({
if (exists("df_num")){
  if (dim(df_num)[2] >1){

  # Numerical variables
  # Graphics: QQ-Plots Summary
  # Set graphical parameters 
  if(ncol(df_num_order)>25){
     par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1))
  } else { 
    par(mfrow=c(ceiling(sqrt(length(df_num_order))), ceiling(sqrt(length(df_num_order)))), 
        mar=c(1.5,1,2,1), oma= c(1,1,1,1))
  }

  for(i in 1:ncol(df_num)) qq_plot(i, colnames(df_num_order[i]), "", "")

}}},  error=function(e) message(e)
)

\pagebreak

# Title
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){
    cat("# Results for Discrete Variables", fill=TRUE)
    cat("## Descriptive Statistics", fill=TRUE)
    cat("### Totals", fill = TRUE)
    cat("The table is sorted by the variable name. If any, N Unique contains the missing category.", fill=TRUE)
   } 
}
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){

  # Discrete variables
  # Descriptive statistics 
   ### Totals table
  miss <- sapply(df_factor, function(col) sum(is.na(col)))
  complete <- sapply(df_factor, function(col) (1-(sum(is.na(col)) / nrow(df_factor)))*100) 
  complete <- round(complete,3)
  totals <- data.frame(miss, complete)
  totals$Variable <- rownames(totals)
  totals$ntotal <- nrow(df_factor)
  totals$valid <- totals$ntotal - totals$miss
  totals$N_Unique <- sapply(df_factor, function(col) length(unique(col)))
  totals <- totals[,c(3,4,1,5,2,6)]
  totals <- totals[order(totals$Variable),]
  colnames(totals) <- c("Variable", "N Obs", "N Missing", "N Valid", "% Complete","N Unique")

  # Column size
  csize <- min(max(nchar(totals$Variable)),25)
  totals$Variable <- substr(totals$Variable,1,csize)

  ## Output table 
  kable(totals, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") 

}}}, error=function(e) message(e)
)

\pagebreak

# Title
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){
    cat("### Frequencies", fill = TRUE)
    cat("The table is sorted by the variable name. For each variable, a maximum of 20 unique values are considered, sorted in decreasing order of their frequency. If any, missings are counted as a category. ", fill=TRUE)
   } 
}
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){

  # Discrete variables
  # Descriptive statistics
  # Frequencies
  discrete <- function(i){

  # Calculate individual statistics
  count <- table(df_factor[,i], useNA="always")
  perc <- as.data.frame(prop.table(count))
  perc$Percent <- perc$Freq*100
  perc$Freq <- NULL

  # Merge to one dataframe
  freq <- merge(count, perc, by="Var1")
  freq$Variable <- rep(colnames(df_factor)[i],nrow(freq))
  freq <- freq[,c(4,1,2,3)]
  colnames(freq) <- c("Variable", "Category","Frequency", "Percent")

  # Rename missing category
  if(length(is.na(freq$Category))>0){
    levels(freq$Category) <- c(levels(freq$Category),"Missing")
    freq$Category[is.na(freq$Category)] <- "Missing"
  }

  # Sort 
  freq_order <- freq[order(-freq[,4],freq[,2]),]


  # Add category "All other values" in case of more than 20 categories
  min <- min(20, length(unique(df_factor[,i])))
  if(min==20){
    freq_order$Category <- as.character(freq_order$Category)
    freq_order <- rbind(freq_order[1:20,],c(colnames(df_factor)[i], as.character("****All Other Values****"),
                                  sum(freq_order$Frequency[-c(1:20)]), sum(freq_order$Percent[-c(1:20)])))
  } else {
    freq_order <- freq_order[1:min,]
  }
  return(freq_order)
}

  # Merge individual variable frequency tables to one table
  cat_table <- discrete(1)
  for (i in 1:dim(df_factor)[2]){
    if (i>1){
      cat_i <- discrete(i)
      cat_table <- rbind(cat_table, cat_i)
    }
  }

 cat_table <- cat_table[order(cat_table$Variable),]
 cat_table$Percent <- round(as.numeric(cat_table$Percent),2)
# longest_category <- paste0(max(c(apply(as.data.frame(cat_table$Category), 1, nchar),8), na.rm=TRUE)*0.15,"cm")
 kable(cat_table, digits=2, row.names = FALSE, longtable=T, booktabs=T, linesep = "") %>%
  # column_spec(2, width=longest_category) %>% 
   kable_styling(font_size = 7,   latex_options = c("scale_down", "repeat_header"), full_width = F)

}}}, error=function(e) message(e)
)

\pagebreak

# Title
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){
  cat("## Graphics", fill=TRUE)
  cat("### Bar-Plots", fill=TRUE)
  if(dim(df_factor)[2]>1){
    cat("One Bar-Plot per page for each variable. Variables are sorted alphabetically.  ", fill=TRUE)
  }
  } 
}
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){

    # Discrete variables
    # Graphics: Barplots Large
    df_factor_order <- df_factor[,order(colnames(df_factor)),drop=FALSE]  

   for (i in 1:ncol(df_factor)){
    counts <- table(df_factor_order[i], useNA = "ifany")
    names(counts)[is.na(names(counts))] <- "Missing"
    counts <- counts[order(counts)]

    if(any(nchar(names(counts), type = "chars") >= 11) || length(counts) > 12){

       # Barplot with suppressed category names if too many
       if(length(counts) > 40){
          par(mar = c(6,6,4.1, 2.1),mgp = c(3, 1, 0))
          barplot(counts, col = "#2fa42d", main = paste("Barplot of ", 
                                    colnames(df_factor_order[i])),xaxt="n",
                  ylab = "Frequency", cex.names = 0.6, las=2, 
                  xlab = colnames(df_factor_order[i]),
                  ylim = range(pretty(c(0,counts))))

          cat("\\newline", fill=TRUE) 
          cat("More than 40 categories. Category labels are not displayed.  ", fill=TRUE)

           } else {

             par(mar = c(8, 8, 4.1, 2.1), mgp = c(6, 1, 0))

           # Barplot with shortened category names if names are too long 
             names(counts) <- substr(names(counts), 1, 15)
             barplot(counts, col = "#2fa42d", main = paste("Barplot of ",
                                              colnames(df_factor_order[i])),
                   ylab = "Frequency", cex.names = 0.65,
                   xlab = colnames(df_factor_order[i]),
                   las = 2,
                   ylim = range(pretty(c(0,counts))))

             }

     } else {

       # Barplot with full-length names
        par(mar = c(6,6, 4.1, 2.1), mgp = c(5, 1, 0))
        barplot(counts, col = "#2fa42d", main = paste("Barplot of ",
                                              colnames(df_factor_order[i])),
                ylab = "Frequency", 
                cex.names = 0.7,ylim=range(pretty(c(0,counts))),
                xlab= colnames(df_factor_order[i]))
     }

     cat("\n\n\\pagebreak\n")
  }

}}}, error=function(e) message(e)
)

\pagebreak

# Title
if (exists("df_factor")){
  if (dim(df_factor)[2] >1){
  cat("### Bar-Plots Summary")
  cat("\n\n\ Multiple Bar-Plots of variables in one figure. Variables are sorted alphabetically.  ", fill=TRUE)
  } 
} 
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] >1){

    # Numerical variables
    # Graphics: Barplots Summary
    # For each variable

    plot_bar <- function(i){
      counts <- table(df_factor_order[i], useNA = "ifany")
      names(counts)[is.na(names(counts))] <- "Missing"
      names(counts)[names(counts)=="NA"] <- "Missing"
      counts <- counts[order(counts)]
      barplot(counts, col = "#2fa42d", main = colnames(df_factor_order[i]),
              ylab = "Frequency",  xaxt="n", ylim=range(pretty(c(0,counts))))
    }

  # Set graphical parameters 
  if(ncol(df_factor)>25){
     par(mfrow=c(5,5), mar=c(1.5,1,2,1), oma=c(1,1,1,1))
  } else { 
    par(mfrow=c(ceiling(sqrt(length(df_factor))), ceiling(sqrt(length(df_factor)))), 
        mar=c(1.5,1,2,1), oma= c(1,1,1,1))
  }  

  for(i in 1:ncol(df_factor_order)) plot_bar(i)

}}}, error=function(e) message(e)
)

\pagebreak

if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){
    cat("### Pie Plots", fill=TRUE)
    if(dim(df_factor)[2]>1){
      cat("One Pie Plot per page for each variable. Variables are sorted alphabetically.  ", fill=TRUE)
    }
  } 
}
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] != 0L){

# Function to create Frequency Table for Each Variable 
freqtable <- function(col){

  # Replace NA with "Missing"
  col[is.na(col)] <- "Missing"

  # Create table with Frequencies
  pie_table_unsorted <- as.data.frame(table(col)) 
  pie_table_sorted <- pie_table_unsorted[order(pie_table_unsorted$Freq, decreasing=TRUE),]
  colnames(pie_table_sorted) <- c("Category", "Frequency")

  # If more than 20 categories: summarize the smallest categories to one category
  if (nrow(pie_table_sorted)>20){
    pie_table_sorted$Category <- as.character(pie_table_sorted$Category)
    pie_table_summarized <- rbind(pie_table_sorted[c(1:20),],
                                  c(as.character("All Other Values"), 
                                    sum(pie_table_sorted$Frequency[-c(1:20)])))
    pie_table_sorted <- pie_table_summarized
  }
  pie_table_sorted$RelFreq <- as.numeric(pie_table_sorted$Frequency) / length(col)
  return(pie_table_sorted)
}

## Plot-Function
plot_pie <- function(table, title, title_size, legend_pos){
  # Direction of the legend for large category names 
  if (max(nchar(as.character(table[,1])))>15){ 
    legend = "vertical"
  } else {
    legend = "horizontal"
  }
  plot <-
    ggplot(table, aes(x = "", y = RelFreq, 
                                 fill = reorder(Category, RelFreq))) +
    guides(fill = guide_legend(title="", reverse = FALSE, direction = legend)) +
    ggtitle(title) +
    geom_col() +
    geom_text(aes(label = scales::percent(RelFreq,accuracy = 0.01)), 
              position = position_stack(vjust = 0.5), size = 2) +
    coord_polar("y", start = 0) +
    theme(axis.title.x = element_blank(),axis.title.y = element_blank(), 
          axis.ticks = element_blank(), panel.grid = element_blank(), 
          axis.text = element_blank(), legend.position = legend_pos,
          panel.background = element_blank(), plot.title = title_size)
  return(plot)
}



for(i in 1:ncol(df_factor_order)){
  table <- freqtable(df_factor_order[,i])
  title <- paste("Pie Chart of ", colnames(df_factor_order[i]))
  title_size <- element_text(hjust = 0.5, face = "bold")
  legend_pos <- "right"
  print(plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos))
  cat("\n\n\\pagebreak\n")
}


}}}, error=function(e) message(e)
)
# Title
if (exists("df_factor")){
  if (dim(df_factor)[2] >1){
  cat("### Pie Plots Summary", fill=TRUE)
  cat("Multiple Pie Plots of variables in one figure. Variables are sorted alphabetically.  ", fill=TRUE)
  } 
} 
tryCatch({
if (exists("df_factor")){
  if (dim(df_factor)[2] >1){

plots <- list()
for (i in 1:ncol(df_factor_order)){
  title <- substr(colnames(df_factor_order[i]), 1, 19)
  title_size <- element_text(size = 30 / min(25, ceiling(sqrt(ncol(df_factor_order)))),
                             face = "bold", hjust = 0.5)
  legend_pos <- "none"
  table <- freqtable(df_factor_order[,i])
  plots[[i]] <- plot_pie(table=table, title=title, title_size=title_size, legend_pos=legend_pos)
}

if (ncol(df_factor_order)>25){
  for(i in 1:ceiling(ncol(df_factor_order)/25)){
    index <- seq((i-1)*25+1,min(i*25,ncol(df_factor_order)), by=1)
    plots_2 <- plots[index]
    grid.arrange(grobs=plots_2, ncol=5)
  }
} else {
  grid.arrange(grobs = plots, ncol = ceiling(sqrt(ncol(df_factor_order))))
}

}}},  error=function(e) message(e)
)


Try the Statsomat package in your browser

Any scripts or data that you put into this service are public.

Statsomat documentation built on Nov. 17, 2021, 5:17 p.m.