Exploratory Data Analysis with Python

# Global knitr options: keep warnings and messages out of the rendered report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

library(reticulate)
library(knitr)

# Create and activate the virtualenv that the Python chunks run in.
virtualenv_create("shiny-app-env", python = "python3")
use_virtualenv("shiny-app-env", required = TRUE)

# Install all Python dependencies of the report in a single call.
virtualenv_install(
  "shiny-app-env",
  packages = c("seaborn", "scipy", "tabulate", "statsmodels")
)

# Get data
df <- params$data
df_code <- df

# Chunk-evaluation switches. Each one is set to TRUE further down, once the
# step it guards has succeeded.
eval0 <- FALSE
eval <- FALSE
eval_num <- FALSE
eval_num2 <- FALSE
eval_factor <- FALSE
eval_factor2 <- FALSE

# Restrict the data to the columns named in params$vars1; `df2` keeps a copy
# of that selection.
tryCatch(
  {
    df <- df[, params$vars1, drop = FALSE]
    df2 <- df

    # Initialize next computations
    eval0 <- TRUE
  },
  error = function(e) {
    stop(safeError("Please try other column names for the following columns: "))
  }
)

# List every requested column that is not present in the data.
if (length(setdiff(params$vars1, colnames(df))) > 0) {
  equal <- intersect(colnames(df), params$vars1)
  kable(setdiff(params$vars1, equal), col.names = "Column")
}

# Initialize next computations
eval <- FALSE
eval_rows <- FALSE

# Prepare the selected data for the report:
#   1. drop columns and rows that contain only missing values,
#   2. turn logical columns and numeric columns with fewer than 7 distinct
#      values into character columns,
#   3. split the result into `df_cont` (continuous) and `df_cat` (categorical).
# Any failure is reported to the user as one generic error.
tryCatch(
  {
    # Drop columns if all observations are missing
    col_names_missing <- vapply(df, function(col) all(is.na(col)), logical(1))
    df[, col_names_missing] <- list(NULL)
    df_list <- df

    # Drop empty rows. is.na() on a data frame always returns a matrix, so the
    # per-row count is correct for any number of rows. With zero columns left
    # nothing is dropped.
    rowsums <- is.na(df)
    rows_drop <- if (ncol(df) > 0L) {
      which(rowSums(rowsums) == ncol(df))
    } else {
      integer(0)
    }
    if (length(rows_drop) != 0L) {
      eval_rows <- TRUE
      length_non_complete <- length(rows_drop)
      df <- df[-rows_drop, , drop = FALSE]
    }

    # Convert logical variables to character. lapply() yields one list element
    # per column whatever the number of rows or selected columns.
    cols_logical <- vapply(df, is.logical, logical(1))
    if (any(cols_logical)) {
      df[cols_logical] <- lapply(df[cols_logical], as.character)
    }

    # Convert numerical variables with less than 7 unique values to character
    # (missing values omitted)
    col_names_numeric <- vapply(
      df,
      function(col) is.numeric(col) && length(unique(na.omit(col))) < 7L,
      logical(1)
    )
    if (any(col_names_numeric)) {
      df[col_names_numeric] <- lapply(df[col_names_numeric], as.character)
    }

    # Extract numerical variables
    df_num <- df[vapply(df, is.numeric, logical(1))]

    # Extract approximate continuous variables and non-continuous variables.
    # `df_noncont` is only created for the "severe" continuity setting.
    if (ncol(df_num) > 0) {
      rateunique_df <- sapply(df_num, function(col) continuous(col))

      if (params$continuity == "severe") {
        # numeric, continuous resp. assumption fulfilled
        df_cont <- df_num[, rateunique_df, drop = FALSE]
        # numeric, non-continuous
        df_noncont <- df_num[, !rateunique_df, drop = FALSE]
      } else {
        df_cont <- df_num
      }
    } else {
      df_cont <- df_num
    }

    # Extract character variables
    df_factor <- df[vapply(df, is.character, logical(1))]

    # Categorical: character columns plus, if present, the non-continuous
    # numeric columns
    if (exists("df_noncont")) {
      df_cat <- merge(df_factor, df_noncont, by = "row.names")
      df_cat$Row.names <- NULL
      df_cat$Row.names.y <- NULL
    } else {
      df_cat <- df_factor
    }

    # Sort by variable name
    df_cont <- df_cont[, order(colnames(df_cont)), drop = FALSE]
    df_cat <- df_cat[, order(colnames(df_cat)), drop = FALSE]

    # Initialize next computations
    eval <- TRUE
  },
  error = function(e) {
    stop(safeError("Dataset cannot be prepared. Try other upload settings and check your data for compatibility reasons."))
  }
)
# Call used libraries
library(kableExtra)

# First page of the report: file name, upload settings, numbers of rows and
# columns, and the variables treated as continuous or categorical.
cat("\n# Basic Information", fill = TRUE)
cat("\\small ", fill = TRUE)
cat("Automatic statistics for the file:", fill = TRUE)
dataname <- params$filename[1]
kable(dataname, col.names = "File", linesep = '', longtable = TRUE)

cat("Your selection for the encoding:", fill = TRUE)
if (params$fencoding == "unknown") {
  cat("Auto")
} else {
  cat("UTF-8")
}
cat("\\newline", fill = TRUE)

cat("Your selection for the decimal character:", fill = TRUE)
if (params$decimal == "auto") {
  cat("Auto")
} else {
  cat(params$decimal)
}
cat("\\newline", fill = TRUE)

cat("Observations (rows with at least one non-missing value): ", fill = TRUE)
cat(nrow(df))
cat("\\newline", fill = TRUE)

# Missing rows
if (exists("length_non_complete")) {
  cat("Number of rows that are dropped because they contain no values (all values are missing):", length_non_complete)
  cat("\\newline", fill = TRUE)
}

cat("Variables (columns with at least one non-missing value): ", fill = TRUE)
cat(ncol(df_list))
cat("\\newline", fill = TRUE)

# Missing columns
if (exists("col_names_missing")) {
  if (sum(col_names_missing) != 0L) {
    cat("Number of columns that are dropped because they contain no values (all values are missing):", sum(col_names_missing), fill = TRUE)
    cat("\\newline", fill = TRUE)
  }
}

# The kable object is deliberately the last expression of each `if`, so that
# it is the value of the top-level statement.
if (exists("df_cont")) {
  cat("Variables considered continuous: ", fill = TRUE)
  if (ncol(df_cont) > 0) {
    cat(ncol(df_cont), fill = TRUE)
    knitr::kable(colnames(df_cont), col.names = "Variables considered continuous", linesep = '', longtable = TRUE) %>%
      kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position", "repeat_header"))
  } else {
    cat("0", fill = TRUE)
    cat("\\newline", fill = TRUE)
  }
}

if (exists("df_cat")) {
  cat("Variables considered categorical: ", fill = TRUE)
  if (ncol(df_cat) > 0) {
    cat(ncol(df_cat), fill = TRUE)
    knitr::kable(colnames(df_cat), col.names = "Variables considered categorical", linesep = '', longtable = TRUE) %>%
      kable_styling(font_size = 8, position = "center", full_width = FALSE, latex_options = c("HOLD_position", "repeat_header"))
  } else {
    cat("0", fill = TRUE)
    cat("\\newline", fill = TRUE)
  }
}
# Numeric falsely read as character?
#
# Share of the non-missing entries of `col` that as.numeric() can convert.
#
# @param col Vector to inspect (the caller passes character columns).
# @return A single number between 0 and 1; 0 when `col` has no non-missing
#   entries, so that the caller's threshold comparison never sees NaN.
check_reading <- function(col) {
  n_present <- sum(!is.na(col))
  if (n_present == 0L) {
    return(0)
  }
  # Unconvertible strings are the expected case here, so the coercion
  # warning is silenced.
  parsed <- suppressWarnings(as.numeric(col))
  sum(!is.na(parsed)) / n_present
}

# Warn about character columns of the selected data in which more than 90% of
# the non-missing values could be read as numbers.
df_char2 <- df2[vapply(df2, is.character, logical(1))]
numeric_percent <- vapply(df_char2, check_reading, numeric(1))

# Undefined shares (NA/NaN) are excluded so they cannot trigger the warning.
numeric_like <- names(numeric_percent)[!is.na(numeric_percent) & numeric_percent > 0.9]

if (length(numeric_like) != 0L) {
  cat("**Warning: More than 90% of the values of these columns could be treated as numeric. Nevertheless, because of some values or the selected decimal character, the columns must be treated as discrete. Are all the values plausible? Please check the data once more before uploading! Column(s):**", numeric_like, fill = TRUE)
}

\pagebreak

# Copy
if (exists("df_cont")) df_num <- df_cont
if (exists("df_cat")) df_factor <- df_cat

# Send df to Python
## Call Python environment
main <- import_main()

## For each data frame that exists: replace NA with NaN to be converted
## correctly by reticulate, then add it to the Python environment. The
## replacement sits under the same guard as the transfer, so a missing data
## frame no longer raises an error here.
if (exists("df_cont")) {
  df_num[is.na(df_num)] <- NaN
  main$df_num <- r_to_py(df_num)
}
if (exists("df_cat")) {
  df_cat[is.na(df_cat)] <- NaN
  main$df_cat <- r_to_py(df_cat)
}

```{python, eval=eval, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, results="asis"} import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from scipy import stats import math import statsmodels.api as sm from tabulate import tabulate import sys if not sys.warnoptions: import warnings warnings.simplefilter("ignore")

Set console printing options

pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 20) pd.set_option('display.width', 1000)

```r
# Title: open the numerical-results section and enable the Python chunks
# that depend on at least one numerical variable.
if (exists("df_num") && ncol(df_num) != 0L) {
  eval_num <- TRUE
  cat("# Results for Numerical Variables", fill = TRUE)
  cat("## Descriptive Statistics", fill = TRUE)
  cat("Variables are sorted alphabetically. Missings are omitted in stats. CV only for positive variables. ", fill = TRUE)
}

```{python, eval=eval_num, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, results="asis"}

# Continuous variables
# Descriptive statistics

# Sort by variable name
df_num.sort_index(axis=1, inplace=True, key=lambda x: x.str.lower())
df_num_stats = df_num.describe()
df_num_stats = df_num_stats.transpose()
df_num_stats.drop(columns=['25%', '75%'], inplace=True)
df_num_stats.rename(columns={"count": "N Valid", "mean": "Mean", "50%": "Median", "std": "SD", "min": "Min", "max": "Max"}, inplace=True)
df_num_stats["N Obs"] = df_num.shape[0]
df_num_stats["N Missing"] = df_num.isna().sum()
df_num_stats["% Complete"] = (df_num_stats["N Valid"] / df_num_stats["N Obs"])*100
df_num_stats["N Unique"] = df_num.nunique(dropna=True)
df_num_stats["MAD"] = df_num.apply(lambda x: stats.median_abs_deviation(x, scale="normal", nan_policy='omit'))
df_num_stats["Skewness"] = df_num.skew(axis=0)
df_num_stats["Kurtosis"] = df_num.kurtosis(axis=0)
# CV is only computed for strictly positive variables; it is blanked otherwise
df_num_stats.loc[df_num_stats["Min"]>0,"CV"] = df_num_stats["SD"] / df_num_stats["Mean"]
df_num_stats = df_num_stats.round(2)
df_num_stats.loc[df_num_stats["Min"]<=0,"CV"] = ""

# Reorder columns to be similar to the output of the R app
df_num_stats = df_num_stats.reindex(columns=["N Obs", "N Missing", "N Valid", "% Complete", "N Unique", "Mean", "SD", "Median", "MAD", "Min", "Max", "Skewness", "Kurtosis", "CV"])

# Restrict size of characters of index
cnames = np.array(df_num_stats.index).astype('str')
csize = min(np.max(np.char.str_len(cnames)),25)
df_num_stats.index = list(map(lambda x: x[0:csize], cnames))

# Output
print(tabulate(df_num_stats, headers='keys', tablefmt="latex_longtable"))

\pagebreak


```r
# Headings and explanatory text for the histogram pages. The wording depends
# on whether there is one numerical variable or several.
if (exists("df_num") && ncol(df_num) >= 1) {
  cat("## Graphics", fill = TRUE)
  cat("### Histograms", fill = TRUE)
  if (ncol(df_num) > 1) {
    cat("Details: Density Histograms. One large figure per page for each variable, sorted alphabetically. The blue line represents the normal density approximation. The green line represents a special kernel density approximation.  ", fill = TRUE)
    cat("\\newline", fill = TRUE)
    cat("See figures on next page. ", fill = TRUE)
  } else {
    cat("Details: Density Histogram. The blue line represents the normal density approximation. The green line represents a special kernel density approximation.  ", fill = TRUE)
  }
}

```{python histolarge, eval=eval_num, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Histograms Large
def histo(ax, var, ylabel, xsize, title):
    """Draw a density histogram of one numerical variable with two overlays.

    ax     -- matplotlib axes to draw on
    var    -- column position in df_num
    ylabel -- label of the y axis
    xsize  -- font size of the x-axis label (the variable name)
    title  -- axes title
    """
    # Histogram with kernel density estimate
    sns.histplot(df_num.iloc[:,var], ax = ax, stat='density', color = '#2fa42d', kde = True)
    minvar = df_num.iloc[:,var].min()
    maxvar = df_num.iloc[:,var].max()
    # A fixed number of grid points keeps the normal curve usable for any
    # range; a fixed step of 0.1 gives one point for narrow ranges and
    # millions for wide ones.
    xaxis = np.linspace(minvar, maxvar, 200)
    fit_normal = stats.norm.pdf(xaxis, df_num.iloc[:,var].mean(), df_num.iloc[:,var].std())
    ax.plot(xaxis, fit_normal, lw = 1, color = '#396e9f')
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.set_xlabel(df_num.columns[var], fontsize=xsize)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

# One large figure per variable
sns.set_style()
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    histo(ax, var, "Density", 25, title="Histogram")
    plt.show()
    if (var==0):
        plt.plot()

\pagebreak

```r
# Title: the summary figure is only produced for more than one numerical
# variable; this also enables the Python summary chunks.
if (exists("df_num") && ncol(df_num) > 1) {
  eval_num2 <- TRUE
  cat("### Histograms Summary", fill = TRUE)
  cat("Multiple Relative Frequency Histogram in one figure. Variables are sorted alphabetically. The blue line represents the normal density approximation. The green line represents a special kernel density approximation.  ", fill = TRUE)
}

```{python, eval=eval, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Little helper to massage the axs list to have correct length
def trim_axs(axs, N):
    """Keep the first N axes of a grid and remove the rest.

    axs -- array of axes (anything with a `.flat` iterator)
    N   -- number of axes to keep

    Returns the first N axes; `remove()` is called on every other one.
    """
    axs = axs.flat
    for ax in axs[N:]:
        ax.remove()
    return axs[:N]

```{python, eval=eval_num2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}
# Histograms Summary (tested for ncols <=25)
# Grid for the multi-plot: nrow is the rounded-up square root of the number
# of variables, ncol the number of columns then needed.
nrow = math.ceil(np.sqrt(df_num.shape[1]))
ncol = math.ceil(df_num.shape[1] / nrow)
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
# Drop the grid cells that have no variable
ax = trim_axs(ax, df_num.shape[1])
fig.subplots_adjust(hspace = 1)
for var, ax in enumerate(ax.flatten()):
    histo(ax, var, "", 8, title="")
plt.show()

\pagebreak

# Heading for the box-plot pages; the explanatory sentence is only needed
# when there are several numerical variables.
if (exists("df_num") && ncol(df_num) != 0L) {
  cat("### Box-Plots", fill = TRUE)
  if (ncol(df_num) > 1) {
    cat("One Box-Plot per page for each variable. Variables are sorted alphabetically.  ", fill = TRUE)
  }
}

```{python, eval=eval_num, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Box-Plots Large
def box(ax, var, xsize):
    """Draw a box plot of one numerical variable.

    ax    -- matplotlib axes to draw on
    var   -- column position in df_num
    xsize -- font size of the x-axis label (the variable name)
    """
    sns.boxplot(x=df_num.iloc[:,var], ax = ax, color = '#2fa42d')
    ax.set_xlabel(df_num.columns[var], fontsize=xsize)
    title = "Box-Plot of " + str(df_num.columns[var])
    ax.set_title(title)

# One large figure per variable
sns.set_theme(style="whitegrid")
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    box(ax, var, 25)
    plt.show()
    if (var==0):
        plt.plot()
    plt.close(fig)

\pagebreak


```r
# Title: the summary figure is only announced for more than one numerical
# variable.
if (exists("df_num") && ncol(df_num) > 1) {
  cat("### Box-Plots Summary", fill = TRUE)
  cat("Multiple Box-Plots of variables in one figure. Variables are sorted alphabetically.  ", fill = TRUE)
}

```{python, eval=eval_num2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Box-Plots Summary (tested for ncols <=25)
# nrow and ncol are not defined here; they are the grid dimensions computed
# in the histogram summary chunk.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])
fig.subplots_adjust(hspace = 1)
for ax, var in zip(ax.flatten(), range(df_num.shape[1])):
    box(ax, var, 15)
plt.show()

\pagebreak


```r
# Heading and explanatory text for the ECDF pages; the wording depends on
# whether there is one numerical variable or several.
if (exists("df_num") && ncol(df_num) >= 1) {
  cat("### ECDF Plots", fill = TRUE)
  if (ncol(df_num) == 1) {
    cat("ECDF (Empirical Cumulative Distribution Function) Plot.  The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF.  ", fill = TRUE)
  } else {
    cat(" One ECDF (Empirical Cumulative Distribution Function) Plot per page for each variable. Variables are sorted alphabetically. The blue line represents the CDF of a normal distribution. If the variable is normally distributed, the blue line approximates well the ECDF.  ", fill = TRUE)
  }
}

```{python, eval=eval_num, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# ECDF Large
def ecdf(ax, var, xsize):
    """Draw the empirical CDF of one numerical variable plus a normal CDF.

    ax    -- matplotlib axes to draw on
    var   -- column position in df_num
    xsize -- font size of the x-axis label (the variable name)
    """
    sns.ecdfplot(df_num.iloc[:,var], ax = ax, color = '#2fa42d')
    minvar = df_num.iloc[:,var].min()
    maxvar = df_num.iloc[:,var].max()
    # A fixed number of grid points keeps the normal curve usable for any
    # range; a fixed step of 0.1 gives one point for narrow ranges and
    # millions for wide ones.
    xaxis = np.linspace(minvar, maxvar, 200)
    fit_normal = stats.norm.cdf(xaxis, df_num.iloc[:,var].mean(), df_num.iloc[:,var].std())
    ax.plot(xaxis, fit_normal, lw = 1, color = '#396e9f')
    ax.set_xlabel(df_num.columns[var], fontsize=xsize)
    ax.set_ylabel("ECDF")

# One large figure per variable
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    ecdf(ax, var, 15)
    plt.show()
    if (var==0):
        plt.plot()

\pagebreak


```r
# Title: the summary figure is only announced for more than one numerical
# variable.
if (exists("df_num") && ncol(df_num) > 1) {
  cat("### ECDF Plots Summary", fill = TRUE)
  cat("Multiple ECDF Plots of variables in one figure. Variables are sorted alphabetically.  ", fill = TRUE)
}

```{python, eval=eval_num2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# ECDF Plots Summary (tested for ncols <=25)
# nrow and ncol are not defined here; they are the grid dimensions computed
# in the histogram summary chunk.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])
fig.subplots_adjust(hspace = 1)
for ax, var in zip(ax.flatten(), range(df_num.shape[1])):
    ecdf(ax, var, 10)
plt.show()

\pagebreak


```r
# Heading for the QQ-plot pages; the explanatory sentence is only printed
# when there are several numerical variables.
if (exists("df_num") && ncol(df_num) >= 1) {
  cat("### QQ-Plots", fill = TRUE)
  if (ncol(df_num) > 1) {
    cat("One QQ-Plot per page for each variable. Variables are sorted alphabetically.  ", fill = TRUE)
  }
}

```{python, eval=eval_num, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# QQ Large
def qqplotf(var, ax, xlabel, ylabel, title):
    """Draw a normal QQ plot of one numerical variable.

    var    -- column position in df_num
    ax     -- matplotlib axes to draw on
    xlabel -- label of the x axis
    ylabel -- prefix of the y-axis label; the variable name is appended
    title  -- axes title
    """
    var_dropna = df_num.iloc[:,var].dropna()
    sm.qqplot(var_dropna, fit=True, ax = ax, line = '45')
    ax.set_xlabel(xlabel)
    # str() as in the other plot helpers, so non-string column labels work
    ax.set_ylabel(ylabel + str(df_num.columns[var]))
    ax.set_title(title)
    ax.get_lines()[0].set_markerfacecolor('#2fa42d')
    ax.get_lines()[0].set_markeredgecolor('#2fa42d')

# One large figure per variable
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    qqplotf(var, ax, "Normal Quantiles", "Sample Quantiles for ", "QQ-Plot")
    plt.show()
    if (var==0):
        plt.plot()

\pagebreak


```r
# Title: the summary figure is only announced for more than one numerical
# variable.
if (exists("df_num") && ncol(df_num) > 1) {
  cat("### QQ-Plots Summary", fill = TRUE)
  cat("Multiple QQ-Plots of variables in one figure. Variables are sorted alphabetically.  ", fill = TRUE)
}

```{python, eval=eval_num2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# QQ Plots Summary (tested for ncols <=25)
# nrow and ncol are not defined here; they are the grid dimensions computed
# in the histogram summary chunk.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])
fig.subplots_adjust(hspace = 1)
for ax, var in zip(ax.flatten(), range(df_num.shape[1])):
    qqplotf(var, ax, "", "", "")
plt.show()

\pagebreak


```r
# Title: open the discrete-results section and enable the Python chunks that
# depend on at least one categorical variable.
if (exists("df_factor") && ncol(df_factor) != 0L) {
  eval_factor <- TRUE
  cat("# Results for Discrete Variables", fill = TRUE)
  cat("## Descriptive Statistics", fill = TRUE)
  cat("### Totals", fill = TRUE)
  cat("The table is sorted by the variable name. If any, N Unique contains the missing category.", fill = TRUE)
}

```{python, eval=eval_factor, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8), results="asis"}

# Categorical Variables
# Table Totals

# One entry per variable, each holding the number of observations
nobs = np.ones(df_cat.shape[1])*df_cat.shape[0]
nobs = nobs.astype(int)
df_cat.sort_index(axis=1, inplace=True, key=lambda x: x.str.lower())
df_cat = df_cat.astype("string")
df_cat_stats = pd.DataFrame(nobs, columns=['N Obs'], index=df_cat.columns)
# Missing values are counted both as real NA and as the literal string 'NaN'
df_cat_stats["N Missing"] = df_cat.apply(lambda x: (x == 'NaN').sum() + x.isna().sum())
df_cat_stats["N Valid"] = df_cat_stats["N Obs"] - df_cat_stats["N Missing"]
df_cat_stats["% Complete"] = (df_cat_stats["N Valid"] / df_cat_stats["N Obs"])*100
df_cat_stats["N Unique"] = df_cat.nunique(dropna=False)
df_cat_stats = df_cat_stats.round(2)

# Restrict size of characters of index
cnames = np.array(df_cat_stats.index).astype('str')
csize = min(np.max(np.char.str_len(cnames)),25)
df_cat_stats.index = list(map(lambda x: x[0:csize], cnames))

# Table
print(tabulate(df_cat_stats, headers='keys', tablefmt="latex_longtable"))

\pagebreak

```r
# Title: heading and explanation of the frequency table, printed when there
# is at least one categorical variable.
if (exists("df_factor") && ncol(df_factor) != 0L) {
  cat("### Frequencies", fill = TRUE)
  cat("The table is sorted by the variable name. For each variable, a maximum of 20 unique values are considered, sorted in decreasing order of their frequency. If any, missings are counted as a category. ", fill = TRUE)
}

```{python, eval=eval_factor, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8), results="asis"}

# Categorical Variables
# Table Frequencies

# Data preparation: real missing values and the literal string 'NaN' both
# become the category 'Missing'
df_cat_miss = df_cat.fillna('Missing')
df_cat_miss.replace(to_replace='NaN', value='Missing', inplace=True)

# Restrict to first characters
df_cat_miss = df_cat_miss.applymap(lambda x: x[0:19])

# Function to compute freq per variable
def freq(var):
    """Frequency table of one column of df_cat_miss.

    var -- column label in df_cat_miss

    Returns a DataFrame with the columns Variable, Category, Frequency and
    Percent (relative frequency as a fraction), limited to the 20 most
    frequent categories and sorted by descending Frequency, then Category.
    """
    # Compute Frequency and Percent columns
    counts = df_cat_miss[var].value_counts(sort=True)
    shares = df_cat_miss[var].value_counts(normalize=True, sort=True)

    # Define output df; shares are aligned to the categories of counts
    df = pd.DataFrame({
        "Variable": var,
        "Category": counts.index,
        "Frequency": counts.values,
        "Percent": shares.reindex(counts.index).values,
    })

    # Only the first 20 categories
    df = df.head(n=20)

    # Sort also by Category. `ascending` must be a list: a set has no
    # defined order and is not accepted as a per-column specification.
    df = df.sort_values(by=['Frequency', 'Category'], ascending=[False, True])

    return df

# Initialize list of dfs: one frequency table per variable
listdf = []
for var in df_cat_miss.columns:
    listdf.append(freq(var))

# Stack the per-variable tables and print them as one LaTeX longtable
df_freq = pd.concat(listdf, ignore_index=True)
print(tabulate(df_freq, headers='keys', tablefmt="latex_longtable", showindex=False))

\pagebreak


```r
# Title: headings for the bar-plot pages. With more than one categorical
# variable the explanation is printed and the summary chunks are enabled.
if (exists("df_factor") && ncol(df_factor) != 0L) {
  cat("## Graphics", fill = TRUE)
  cat("### Bar-Plots", fill = TRUE)
  if (ncol(df_factor) > 1) {
    cat("One Bar-Plot per page for each variable. Variables are sorted alphabetically. No labels for variables with more than 40 categories.", fill = TRUE)
    eval_factor2 <- TRUE
  }
}

```{python, eval=eval_factor, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Categorical Variables
# Bar-Plot Large
def barplot(var, ax, type):
    """Draw a bar plot of the category counts of one categorical variable.

    var  -- column position in df_cat_miss
    ax   -- matplotlib axes to draw on
    type -- "Summary" suppresses the tick labels; any other value keeps them
    """
    column = df_cat_miss.iloc[:,var]
    ncats = column.nunique()
    sns.countplot(x = df_cat_miss.columns[var], ax = ax, color = '#2fa42d', data = df_cat_miss, order = column.value_counts(ascending = True).index)

    # Rotate labels
    if ncats in range(10,41):
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")

    # No labels for variables with ncats > 40 or type summary
    if ncats > 40 or type=="Summary":
        ax.set_xticklabels([])

    ax.set_xlabel(df_cat_miss.columns[var])
    ax.set_ylabel("Frequency")
    title = "Bar-Plot of " + str(df_cat_miss.columns[var])
    ax.set_title(title)

# One large figure per variable
for var in range(df_cat_miss.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    barplot(var, ax, type="Large")
    plt.show()
    if (var==0):
        plt.plot()

\pagebreak


```r
# Title: the summary figure is only announced for more than one categorical
# variable.
if (exists("df_factor") && ncol(df_factor) > 1) {
  cat("### Bar-Plots Summary")
  cat("\n\n\ Multiple Bar-Plots of variables in one figure. Variables are sorted alphabetically. No labels displayed. ", fill = TRUE)
}

```{python, eval=eval_factor2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Bar-Plots Summary (tested for ncols <=25)
# Grid for the multi-plot, sized for the number of categorical variables
nrow = math.ceil(np.sqrt(df_cat.shape[1]))
ncol = math.ceil(df_cat.shape[1] / nrow)
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_cat.shape[1])
fig.subplots_adjust(hspace = 1)
for ax, var in zip(ax.flatten(), range(df_cat.shape[1])):
    barplot(var, ax, type="Summary")
plt.show()

\pagebreak


```r
# Heading for the pie-plot pages; the explanatory sentence is only needed
# when there are several categorical variables.
if (exists("df_factor") && ncol(df_factor) != 0L) {
  cat("### Pie Plots", fill = TRUE)
  if (ncol(df_factor) > 1) {
    cat("One Pie Plot per page for each variable. Variables are sorted alphabetically.  ", fill = TRUE)
  }
}

```{python, eval=eval_factor, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,8)}

# Pie-Plot Large
def pieplot(var,ax, titlesize, textsize, radius):
    """Draw a pie plot of the category counts of one categorical variable.

    var       -- column position in df_cat_miss
    ax        -- matplotlib axes to draw on
    titlesize -- font size of the title
    textsize  -- font size of the slice labels and percentages
    radius    -- radius of the pie
    """
    cats = df_cat_miss.iloc[:,var].value_counts()
    labels = cats.index
    ax.pie(x = cats, autopct="%.1f%%", explode=[0]*len(labels), labels = labels, pctdistance=0.7, textprops={'fontsize': textsize}, shadow = True, radius = radius)
    # str() as in the other plot helpers, so non-string column labels work
    ax.set_title("Pie-Plot of " + str(df_cat_miss.columns[var]), fontsize = titlesize, pad=10)

# One large figure per variable
for var in range(df_cat_miss.shape[1]):
    fig, ax = plt.subplots(nrows = 1, ncols=1)
    pieplot(var, ax, titlesize=14, textsize=8, radius=1)
    plt.show()
    if (var==0):
        plt.plot()

\pagebreak


```r
# Title: the summary figure is only announced for more than one categorical
# variable.
if (exists("df_factor") && ncol(df_factor) > 1) {
  cat("### Pie Plots Summary", fill = TRUE)
  cat("Multiple Pie Plots of variables in one figure. Variables are sorted alphabetically.  ", fill = TRUE)
  cat("\\newline", fill = TRUE)
  cat("See figures on next page. ", fill = TRUE)
}

```{python, eval=eval_factor2, echo = FALSE, comment='', message = FALSE, error = TRUE, warning=FALSE, fig.dim = c(14,7)}

Pie-Plots Summary (tested for ncols <=25)

plt.rcParams.update(plt.rcParamsDefault) fig, ax = plt.subplots(nrow, ncol, sharex=True, sharey=True, figsize=(10,10)) ax = trim_axs(ax, df_cat.shape[1]) fig.subplots_adjust(hspace = 1) for ax, var in zip(ax.flatten(), range(df_cat.shape[1])): pieplot(var, ax, titlesize=7, textsize=6, radius=1.2) plt.subplots_adjust(hspace=0.3, wspace=0.01) plt.show() ```



Try the Statsomat package in your browser

Any scripts or data that you put into this service are public.

Statsomat documentation built on Nov. 17, 2021, 5:17 p.m.