# Global chunk options: show the code, hide warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# Packages this report depends on.
packages <- c("funModeling", "haven", "corrplot", "tidyverse")

# Install any package that is not yet available, then attach it.
# requireNamespace() only checks availability; library() does the attaching
# and fails loudly if the installation did not succeed.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
HAVEN Data Profiling
# Read the test file into a variable
my_test <- read.csv("example_data_set.csv", header = TRUE)

# Transform columns to dates.
# NOTE(review): "%y" parses a two-digit year. If the file stores four-digit
# years (dd/mm/yyyy) the format should be "%d/%m/%Y" -- confirm against the
# data before changing it.
date_cols <- c(
  "DissolutionDate",
  "IncorporationDate",
  "Accounts.NextDueDate",
  "Accounts.LastMadeUpDate",
  "Returns.NextDueDate",
  "Returns.LastMadeUpDate",
  "ConfStmtNextDueDate",
  "ConfStmtLastMadeUpDate"
)
my_test[date_cols] <- lapply(my_test[date_cols], as.Date, format = "%d/%m/%y")

# glimpse() comes from the dplyr package
glimpse(my_test)

# Remove some extraneous previous name fields: the CONDATE / CompanyName pair
# for PreviousName_2 through PreviousName_10. all_of() keeps the original
# behaviour of failing if any of these columns is missing.
previous_name_cols <- paste0(
  "PreviousName_",
  rep(2:10, each = 2L),
  c(".CONDATE", ".CompanyName")
)
my_test <- my_test %>%
  select(-all_of(previous_name_cols))

# Change blank fields to NA
my_test[my_test == ""] <- NA

# Data Integrity
data_integrity(my_test)

# Descriptive Statistics
summary(my_test)

# profiling_num
# Get a metric table with many indicators for all numerical variables,
# automatically skipping the non-numerical variables.
# Current metrics are: mean, std_dev: standard deviation,
# all the p_XX: percentile at XX number, skewness, kurtosis,
# iqr: inter quartile range,
# variation_coef: the ratio of sd/mean,
# range_98: presumably the range that holds 98% of the values (the original
# comment was cut off here -- confirm against the funModeling documentation).
profiling_num(my_test)

# ggplot2 histogram of Accounts.AccountRefDay
ggplot(my_test, aes(x = Accounts.AccountRefDay)) +
  geom_histogram()

# Use plot_num to profile continuous variables
plot_num(my_test, bins = 15)
Change the chunk options here (for example, set `echo = FALSE`) so that the code is NOT included in the output.
# Frequency tables and bar plots for the selected categorical columns.
freq(
  my_test,
  input = c(
    "RegAddress.Country",
    "CompanyCategory",
    "CompanyStatus",
    "CountryOfOrigin",
    "Accounts.AccountCategory"
  )
)
# Probability Density
# Histogram/density plots split by CompanyStatus. No `input` is given, so
# the choice of plotted variables is left to plotar()'s default.
plotar(my_test, target = "CompanyStatus", plot_type = "histdens")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.