knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
library(ggplot2)
about: time-variables, modeled - variables, imputed-variables
For this report, a large sample from the wage data set is used, for speed and memory reasons but this can be easily run for the whole set.
ch <- RODBC::odbcDriverConnect("DRIVER={SQL Server}; SERVER=XXXXXX; trusted_connection=true") df0 <- RODBC::sqlQuery(ch,"SELECT * FROM ZZZZZZZZZZZ where sector='B' ", as.is=TRUE) close(ch) if (nrow(df0) > 30000) { df <- df0[sample(rownames(df0), size=30000), ] } rm(df0) micro <- df dnames <-names(DataExplorer::split_columns(micro)$discrete) cnames <- names(DataExplorer::split_columns(micro)$continuous) ref_year <- 2000 #add a variable for the logarithm of wages micro <-dplyr::mutate(micro,logwage=log(wage)) micro <- dplyr::mutate(micro,time_arbitr=((year_record-ref_year)*12+month_record) ) micro <- micro[which(micro$eduShort < 8 & micro$eduShort >0),] micro$eduShort[micro$eduShort < 3 ] <- "a_low" ### reference level micro$eduShort[micro$eduShort >=3 & micro$eduShort <=5 ] <- "low" micro$eduShort[micro$eduShort == 6 ] <- "med" micro$eduShort[micro$eduShort >=7 & micro$eduShort <=8 ] <- "vhigh" df <- dplyr::select(micro, -c(KT, MODUR_KFY, weight1, weight0, work_regime, month_record, wageMod, year_record, fullvin, sector))
ggplot2, DataExplorer, funModeling, tabplot, forecast, tsfeatures, anomalous
#view_data(df) dnames <-names( DataExplorer::split_columns(df)$discrete) cnames <- names( DataExplorer::split_columns(df)$continuous) #short overview Hmisc::describe(df) # data univariate plots DataExplorer::plot_intro(df) DataExplorer::plot_missing(df) #DataExplorer::plot_bar(df, maxcat=450) funModeling::freq(df) #DataExplorer::plot_histogram(df) funModeling::plot_num(df) # table plots from tabplot package, for even smaller sample: if (nrow(df) > 10000) { df_short <- df[sample(rownames(df), size=10000), ] } ### set this as on option whih could be adjusted #plot_list <- lapply(cnames, FUN=function(x0) { tabplot::tableplot(dat=df_short, sortCol=x0)$plot } )
lapply(cnames, FUN=function(var) { ggplot2::ggplot(df, ggplot2::aes(df[[var]])) + ggplot2::stat_ecdf(geom = "point") + ggplot2::xlab(var) + ggplot2::ylab("cumulative prob") } )
Note that: printing is not yet "addapted" to size of data and paper
# warning=FALSE and message=FALSE ggplot2::theme_set(ggplot2::theme_bw()) df_red <- dplyr::select(df, logwage, totexper, eduCode, KYN, occup, ageM) dnames1 <-names(DataExplorer::split_columns(df_red)$discrete) cnames1 <- names(DataExplorer::split_columns(df_red)$continuous) lapply(dnames1, FUN=function(varr) { GGally::ggpairs( df_red , ggplot2::aes( colour = df_red[[varr]] ), cardinality_threshold = 50 ) } ) ## ------ most general correlation plots: for any type of var # corrr::correlate(df) is an other option # printing is not perfect DataExplorer::plot_correlation( data=df_red, type = "all", maxcat = 50L, cor_args = list(), title = NULL, ggtheme = theme_gray(), theme_config = list(legend.position ="bottom", axis.text.x = element_text(angle=90)))
# qqplots of continuous variables ------------------ DataExplorer::plot_qq(df_red) # boxplots by each discrete ----------------- # lapply(dnames1, FUN=function(varr) { DataExplorer::plot_boxplot( df_red, by=varr , geom_boxplot_args = list("outlier.color"="red")) } ) # univar limits, using Tukey (interquartiles) knitr::kable( lapply(cnames1, FUN=function(x0) { c( x0, funModeling::tukey_outlier(as.data.frame(df_red)[[x0]]) ) } ) , format="markdown", col.names = " " ) # univar limits, using Hampel (median based ) knitr::kable( lapply(cnames1, FUN=function(x0) { c( x0, funModeling::hampel_outlier(df_red[[x0]]) ) } ) , format="markdown", col.names = " " ) # more tests for outliers #------ one score for each record, for each variable # too long. To find a representation of this! #lapply(cnames, FUN=function(x0){ #which( # outliers::scores(df[[x0]], type="t") == TRUE # ) ## other options ## outliers::scores(df[[x0]], type="z") ## outliers::scores(df[[x0]], type="iqr") # } # ) #----- comparing many methods, using package "OutliersO3" --------- # it works on continuous variables # comparing two methods # could add four more, at least: "BAC", "adjOut", "DDC, "MCD" # but this is rather slow even with two #O3m <- OutliersO3::O3prep(df[cnames], method=c("HDo", "PCS")) #O3m1 <- OutliersO3::O3plotM(O3m) #gridExtra::grid.arrange(O3m1$gO3, O3m1$gpcp, ncol=1)
#---------------------- categorical variables # use funModeling and information_theory measures, all: # entropy (en), mutual information (mi), information gain (ig), gain ratio (gr) d_res <- c("var1","var2"," "," "," ", " ") var1 <- character() var2 <- character() for (var1 in dnames1){ for (var2 in dnames1) { if(var1 != var2) { d_res <- cbind(d_res, c(var1, var2,funModeling::infor_magic(input = df[[var1]],target =df[[var2]]))) } } } d_res # continuous variables: using boot.ci seems to demand too much memory. To find a better solution
#mulitvar ---- with example series #tsmultiv <- ts(matrix(rnorm(3000),ncol=100),freq=4) #y <- anomalous::tsmeasures(tsmultiv) #anomalous::biplot.features(y) #anomalous::anomaly(y) #tsfeatures::tsfeatures(tsmultiv) #univar--- with example series #tsuniv <- ts(rnorm(1000)) #tsfeatures::tsfeatures(tsuniv) #tseries::jarque.bera.test(tsuniv) #tseries::kpss.test(tsuniv) #tseries::kpss.test(tsuniv, null = "Trend") #tseries::adf.test(tsuniv) #forecast::Acf(tsuniv) #forecast::Pacf(tsuniv)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.