Nothing
# Emit the Python snippet that lists the rows that are empty with respect to
# the selected variables. `rows_drop` holds 1-based R row numbers; Python is
# 0-based, hence the `- 1`.
# The comment and the assignment must sit on separate lines of the emitted
# Python code, otherwise `rows_drop = [...]` becomes part of the comment.
text <- "# Empty rows w.r.t. selected variables\nrows_drop = ["
code <- paste0(text, paste(rows_drop - 1, collapse = ","), "]")
codechunk(code, style = list("background-color" = "#FFFFFF"))
# Drop empty rows
# rows_drop holds row positions; translate them to index labels and remove
# those rows from df in place.
empty_row_labels = df.index[rows_drop]
df.drop(index=empty_row_labels, inplace=True)
# Emit the Python snippet that lists the column positions of the selected
# continuous variables: the columns of `df_code` whose names also occur in
# `df_num`, shifted to Python's 0-based indexing.
# The comment and the assignment must sit on separate lines of the emitted
# Python code, otherwise `cols_continuous = [...]` becomes part of the comment.
text <- "# Column indices of selected continuous variables\ncols_continuous = ["
indices <- which(colnames(df_code) %in% colnames(df_num)) - 1
code <- paste0(text, paste(indices, collapse = ","), "]")
codechunk(code, style = list("background-color" = "#FFFFFF"))
# Data frame of the continuous variables
# Positional selection: keep every row, and only the columns listed in
# cols_continuous.
all_rows = slice(None)
df_num = df.iloc[all_rows, cols_continuous]
# Emit the Python snippet that lists the column positions of the selected
# discrete variables: the columns of `df_code` whose names also occur in
# `df_factor`, shifted to Python's 0-based indexing.
# The comment and the assignment must sit on separate lines of the emitted
# Python code, otherwise `cols_discrete = [...]` becomes part of the comment.
text <- "# Column indices of selected discrete variables\ncols_discrete = ["
indices <- which(colnames(df_code) %in% colnames(df_factor)) - 1
code <- paste0(text, paste(indices, collapse = ","), "]")
codechunk(code, style = list("background-color" = "#FFFFFF"))
# Data frame of the discrete variables
# Positional selection: keep every row, and only the columns listed in
# cols_discrete.
all_rows = slice(None)
df_cat = df.iloc[all_rows, cols_discrete]
# Continuous variables, descriptive statistics
# Order the columns alphabetically, ignoring case (df_num itself is reordered).
df_num.sort_index(axis=1, inplace=True, key=lambda labels: labels.str.lower())

# Start from describe(): one row per variable, quartiles removed, columns
# renamed to the report's headings.
df_num_stats = (
    df_num.describe()
    .transpose()
    .drop(columns=['25%', '75%'])
    .rename(columns={"count": "N Valid", "mean": "Mean", "50%": "Median",
                     "std": "SD", "min": "Min", "max": "Max"})
)

# Counts and completeness
df_num_stats["N Obs"] = df_num.shape[0]
df_num_stats["N Missing"] = df_num.isna().sum()
df_num_stats["% Complete"] = (df_num_stats["N Valid"] / df_num_stats["N Obs"])*100
df_num_stats["N Unique"] = df_num.nunique(dropna=True)

# Robust spread and shape measures
df_num_stats["MAD"] = df_num.apply(
    lambda col: stats.median_abs_deviation(col, scale="normal", nan_policy='omit')
)
df_num_stats["Skewness"] = df_num.skew(axis=0)
df_num_stats["Kurtosis"] = df_num.kurtosis(axis=0)

# Coefficient of variation only for strictly positive variables
strictly_positive = df_num_stats["Min"] > 0
df_num_stats.loc[strictly_positive, "CV"] = df_num_stats["SD"] / df_num_stats["Mean"]

df_num_stats = df_num_stats.round(2)
# Blank the CV cell where the (already rounded) minimum is not positive
df_num_stats.loc[df_num_stats["Min"] <= 0, "CV"] = ""

## Reorder columns to be similar to the output of the R app
column_order = ["N Obs", "N Missing", "N Valid", "% Complete", "N Unique",
                "Mean", "SD", "Median", "MAD", "Min", "Max",
                "Skewness", "Kurtosis", "CV"]
df_num_stats = df_num_stats.reindex(columns=column_order)

## Restrict size of characters of index
cnames = np.array(df_num_stats.index).astype('str')
csize = min(np.max(np.char.str_len(cnames)), 25)
df_num_stats.index = [name[0:csize] for name in cnames]

## Output
print(tabulate(df_num_stats, headers='keys', tablefmt="simple"))
# Function for Histogram
def histo(ax, var, ylabel, xsize, title):
    """Draw a density histogram of one continuous variable with a normal fit.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    var : positional index of the column in ``df_num``.
    ylabel : label of the y axis.
    xsize : font size of the x-axis label (the column name).
    title : title of the plot.
    """
    series = df_num.iloc[:, var]
    sns.histplot(series, ax=ax, stat='density', color='#2fa42d', kde=True)

    # Evaluate the fitted normal density on a fixed number of points between
    # the observed minimum and maximum. A fixed step (e.g. 0.1) would give a
    # single point for narrow ranges and millions of points for wide ones.
    xaxis = np.linspace(series.min(), series.max(), 200)
    fit_normal = stats.norm.pdf(xaxis, series.mean(), series.std())
    ax.plot(xaxis, fit_normal, lw=1, color='#396e9f')

    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.set_xlabel(df_num.columns[var], fontsize=xsize)
    ax.set_ylabel(ylabel)
    ax.set_title(title)


# Large Histogram Plots
sns.set_style()
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    histo(ax, var, "Density", 25, title="Histogram")
    plt.show()
    # Release the figure once shown (as the box-plot loop does) so that open
    # figures do not accumulate with many variables.
    plt.close(fig)
# Little helper to massage the axes collection to have the correct length
def trim_axs(axs, N):
    """Return the first ``N`` axes as a 1-D array and remove the rest.

    Parameters
    ----------
    axs : an array of axes (any shape) or a single axes object. A single
        object is what ``plt.subplots(1, 1)`` returns, so it must be accepted
        when there is only one variable to plot.
    N : number of axes to keep.

    Returns
    -------
    1-D numpy object array with the first ``N`` axes in flattened order;
    ``remove()`` has been called on every axes beyond the first ``N``.
    """
    import numpy as np

    # Coerce to a flat object array so that a bare axes object (which has no
    # ``.flat``) is handled the same way as a grid of axes.
    flat_axs = np.asarray(axs, dtype=object).reshape(-1)
    for ax in flat_axs[N:]:
        ax.remove()
    return flat_axs[:N]
# Summary Histogram Plots
## Set number of cols and rows
# Near-square grid with one panel per continuous variable; nrow and ncol are
# kept at script level because the later summary plots reuse them.
nrow = math.ceil(np.sqrt(df_num.shape[1]))
ncol = math.ceil(df_num.shape[1] / nrow)

fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small, unlabeled histogram per variable
for var, panel in enumerate(ax.flatten()):
    histo(panel, var, "", 8, title="")
plt.show()
# Function for Box-Plot
def box(ax, var, xsize):
    """Draw a horizontal box plot of one continuous variable.

    ax is the matplotlib Axes to draw on, var the positional index of the
    column in df_num, and xsize the font size of the x-axis label.
    """
    column_name = df_num.columns[var]
    sns.boxplot(x=df_num.iloc[:, var], ax=ax, color='#2fa42d')
    ax.set_xlabel(column_name, fontsize=xsize)
    ax.set_title("Box-Plot of " + str(column_name))


# Large Box-Plots
sns.set_theme(style="whitegrid")
for var in range(len(df_num.columns)):
    # One full-size figure per variable, closed again once it has been shown
    fig, ax = plt.subplots(nrows=1, ncols=1)
    box(ax, var, 25)
    plt.show()
    plt.close(fig)
# Summary Box-Plots
# Uses the nrow x ncol grid computed for the summary histograms.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small box plot per variable
for var, panel in enumerate(ax.flatten()):
    box(panel, var, 5)
plt.show()
# Function for ECDF Plot
def ecdf(ax, var, xsize):
    """Draw the empirical CDF of one continuous variable with a normal CDF.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    var : positional index of the column in ``df_num``.
    xsize : font size of the x-axis label (the column name).
    """
    series = df_num.iloc[:, var]
    sns.ecdfplot(series, ax=ax, color='#2fa42d')

    # Evaluate the fitted normal CDF on a fixed number of points between the
    # observed minimum and maximum. A fixed step (e.g. 0.1) would give a
    # single point for narrow ranges and millions of points for wide ones.
    xaxis = np.linspace(series.min(), series.max(), 200)
    fit_normal = stats.norm.cdf(xaxis, series.mean(), series.std())
    ax.plot(xaxis, fit_normal, lw=1, color='#396e9f')

    ax.set_xlabel(df_num.columns[var], fontsize=xsize)
    ax.set_ylabel("ECDF")


# Large ECDF Plots
for var in range(df_num.shape[1]):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    ecdf(ax, var, 15)
    plt.show()
    # Release the figure once shown (as the box-plot loop does) so that open
    # figures do not accumulate with many variables.
    plt.close(fig)
# ECDF Plots Summary
# Uses the nrow x ncol grid computed for the summary histograms.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small ECDF plot per variable
for var, panel in enumerate(ax.flatten()):
    ecdf(panel, var, 10)
plt.show()
# Function for QQ Plot
def qqplotf(var, ax, xlabel, ylabel, title):
    """Draw a QQ plot of one continuous variable against a 45-degree line.

    var is the positional index of the column in df_num and ax the matplotlib
    Axes to draw on. xlabel and title are used as given; the column name is
    appended to ylabel.
    """
    # Missing values are dropped before the quantiles are computed
    sample = df_num.iloc[:, var].dropna()
    sm.qqplot(sample, fit=True, ax=ax, line='45')

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel + df_num.columns[var])
    ax.set_title(title)

    # Recolor the first line on the axes (both marker face and edge)
    points = ax.get_lines()[0]
    points.set_markerfacecolor('#2fa42d')
    points.set_markeredgecolor('#2fa42d')


# QQ-Plots Large
for var in range(len(df_num.columns)):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    qqplotf(var, ax, "Normal Quantiles", "Sample Quantiles for ", "QQ-Plot")
    plt.show()
## QQ Plots Summary
# Uses the nrow x ncol grid computed for the summary histograms.
fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_num.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small QQ plot per variable, without axis labels or title prefix
for var, panel in enumerate(ax.flatten()):
    qqplotf(var, panel, "", "", "")
plt.show()
# Categorical Variables
## Table Totals
# Number of observations, repeated once per categorical variable
nobs = np.full(df_cat.shape[1], df_cat.shape[0], dtype=int)

# Order the columns alphabetically (ignoring case) and treat every value as
# a string from here on.
df_cat.sort_index(axis=1, inplace=True, key=lambda labels: labels.str.lower())
df_cat = df_cat.astype("string")

# One row per variable: counts, completeness and number of distinct values
# (a missing value counts as its own distinct value).
df_cat_stats = pd.DataFrame(nobs, columns=['N Obs'], index=df_cat.columns)
df_cat_stats["N Missing"] = df_cat.isna().sum()
df_cat_stats["N Valid"] = df_cat.count()
df_cat_stats["% Complete"] = (df_cat_stats["N Valid"] / df_cat_stats["N Obs"])*100
df_cat_stats["N Unique"] = df_cat.nunique(dropna=False)
df_cat_stats = df_cat_stats.round(2)

### Restrict size of characters of index
cnames = np.array(df_cat_stats.index).astype('str')
csize = min(np.max(np.char.str_len(cnames)), 25)
df_cat_stats.index = [name[0:csize] for name in cnames]

### Output
print(tabulate(df_cat_stats, headers='keys', tablefmt="simple"))
# Categorical Variables
## Table Frequencies
### Data preparation
# Missing values become their own category.
df_cat_miss = df_cat.fillna('Missing')
# Keep only the first 19 characters of every value. Written with
# apply + Series.map because DataFrame.applymap is deprecated in recent
# pandas releases.
df_cat_miss = df_cat_miss.apply(lambda col: col.map(lambda x: x[0:19]))


### Function to compute freq per variable
def freq(var):
    """Return the frequency table of one categorical variable.

    Parameters
    ----------
    var : column label in ``df_cat_miss``.

    Returns
    -------
    DataFrame with the columns "Variable", "Category", "Frequency" and
    "Percent" (a proportion of all rows), limited to the 20 most frequent
    categories and sorted by descending frequency, then by category.
    """
    # Compute Frequency and Percent columns
    counts = df_cat_miss[var].value_counts(sort=True)
    table = pd.DataFrame({
        "Variable": var,
        "Category": counts.index,
        "Frequency": counts.to_numpy(),
        "Percent": (counts / counts.sum()).to_numpy(),
    })

    # Only the first 20 categories
    table = table.head(n=20)

    # Sort also by Category. `ascending` must be an ordered list matching
    # `by`; a set has no defined order and is not accepted by pandas.
    return table.sort_values(by=['Frequency', 'Category'],
                             ascending=[False, True])


# One frequency table per variable, stacked into a single table
listdf = [freq(var) for var in df_cat_miss.columns]
df_freq = pd.concat(listdf, ignore_index=True)
print(tabulate(df_freq, headers='keys', tablefmt="simple", showindex=False))
# Function for Bar-Plots
def barplot(var, ax, type):
    """Draw a count plot of one categorical variable, rarest category first.

    var is the positional index of the column in df_cat_miss, ax the
    matplotlib Axes to draw on, and type either "Large" or "Summary"
    ("Summary" suppresses the tick labels).
    """
    column = df_cat_miss.iloc[:, var]
    column_name = df_cat_miss.columns[var]
    n_categories = column.nunique()

    sns.countplot(x=column_name,
                  ax=ax,
                  color='#2fa42d',
                  data=df_cat_miss,
                  order=column.value_counts(ascending=True).index)

    # Rotate labels
    if 10 <= n_categories <= 40:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")

    # No labels for variables with ncats > 40 or type summary
    if type == "Summary" or n_categories > 40:
        ax.set_xticklabels([])

    ax.set_xlabel(column_name)
    ax.set_ylabel("Frequency")
    ax.set_title("Bar-Plot of " + str(column_name))


# Bar-Plots Large
for var in range(len(df_cat_miss.columns)):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    barplot(var, ax, type="Large")
    plt.show()
# Bar-Plots Summary
## Set shape of rows and columns
# Near-square grid with one panel per categorical variable; nrow and ncol are
# kept at script level because the pie-plot summary reuses them.
nrow = math.ceil(np.sqrt(df_cat.shape[1]))
ncol = math.ceil(df_cat.shape[1] / nrow)

fig, ax = plt.subplots(nrow, ncol)
fig.tight_layout(pad=3.0)
ax = trim_axs(ax, df_cat.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small bar plot per variable, without tick labels
for var, panel in enumerate(ax.flatten()):
    barplot(var, panel, type="Summary")
plt.show()
# Function for Pie-Plots
def pieplot(var, ax, titlesize, textsize, radius):
    """Draw a pie chart of the category counts of one categorical variable.

    var is the positional index of the column in df_cat_miss and ax the
    matplotlib Axes to draw on. titlesize and textsize are font sizes for the
    title and the wedge texts; radius is the radius of the pie.
    """
    counts = df_cat_miss.iloc[:, var].value_counts()
    no_offset = [0] * len(counts.index)  # no wedge is pulled out of the pie
    ax.pie(x=counts,
           labels=counts.index,
           explode=no_offset,
           autopct="%.1f%%",
           pctdistance=0.7,
           textprops={'fontsize': textsize},
           shadow=True,
           radius=radius)
    ax.set_title("Pie-Plot of " + df_cat_miss.columns[var],
                 fontsize=titlesize, pad=10)


# Pie-Plots Large
for var in range(len(df_cat_miss.columns)):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    pieplot(var, ax, titlesize=14, textsize=8, radius=1)
    plt.show()
# Pie-Plots Summary
# Go back to matplotlib's default rc settings before drawing the pies.
plt.rcParams.update(plt.rcParamsDefault)

# Uses the nrow x ncol grid computed for the bar-plot summary.
fig, ax = plt.subplots(nrow, ncol, sharex=True, sharey=True, figsize=(10, 10))
ax = trim_axs(ax, df_cat.shape[1])  # drop the unused panels of the grid
fig.subplots_adjust(hspace=1)

# One small pie per variable
for var, panel in enumerate(ax.flatten()):
    pieplot(var, panel, titlesize=7, textsize=6, radius=1.2)

plt.subplots_adjust(hspace=0.3, wspace=0.01)
plt.show()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.