knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
library(ggplot2)

Data set and main information needed

about: time-variables, modeled - variables, imputed-variables

For this report, a large sample from the wage data set is used, for speed and memory reasons but this can be easily run for the whole set.

ch <- RODBC::odbcDriverConnect("DRIVER={SQL Server}; SERVER=XXXXXX; trusted_connection=true")
df0 <- RODBC::sqlQuery(ch,"SELECT * FROM ZZZZZZZZZZZ where sector='B' ", as.is=TRUE)
close(ch)

if (nrow(df0) > 30000) {  df <- df0[sample(rownames(df0), size=30000), ]  } 

rm(df0)

micro <- df

dnames <-names(DataExplorer::split_columns(micro)$discrete)
cnames <- names(DataExplorer::split_columns(micro)$continuous)

ref_year <- 2000

#add a variable for the logarithm of wages
micro <-dplyr::mutate(micro,logwage=log(wage))
micro <- dplyr::mutate(micro,time_arbitr=((year_record-ref_year)*12+month_record) )

micro <- micro[which(micro$eduShort < 8 & micro$eduShort >0),]

micro$eduShort[micro$eduShort < 3 ] <- "a_low"  ### reference level
micro$eduShort[micro$eduShort >=3 & micro$eduShort <=5 ] <- "low"
micro$eduShort[micro$eduShort == 6 ] <- "med"
micro$eduShort[micro$eduShort >=7 & micro$eduShort <=8 ] <- "vhigh"

df <- dplyr::select(micro, -c(KT, MODUR_KFY, weight1, weight0, work_regime, month_record, wageMod, year_record, fullvin, sector))

Output of this report: pdf, html, word.

Plots are now static. We will produce soon interactive ones.

Dashboards are possible

Main packages and resources:

ggplot2, DataExplorer, funModeling, tabplot, forecast, tsfeatures, anomalous

view_data

Overview of main data-set characteristics

#view_data(df)
dnames <-names( DataExplorer::split_columns(df)$discrete)
cnames <- names( DataExplorer::split_columns(df)$continuous)

#short overview
 Hmisc::describe(df)

# data univariate plots
  DataExplorer::plot_intro(df)
  DataExplorer::plot_missing(df)

  #DataExplorer::plot_bar(df, maxcat=450)
  funModeling::freq(df)

  #DataExplorer::plot_histogram(df)
  funModeling::plot_num(df)

# table plots from tabplot package, for even smaller sample:

 if (nrow(df) > 10000) {  df_short <- df[sample(rownames(df), size=10000), ]  }    
  ### set this as on option whih could be adjusted

  #plot_list <-
  lapply(cnames, FUN=function(x0) {
   tabplot::tableplot(dat=df_short, sortCol=x0)$plot
  }
  )

view_univar

Marginal cumulative distributions

 lapply(cnames, FUN=function(var) {
   ggplot2::ggplot(df, ggplot2::aes(df[[var]])) +        ggplot2::stat_ecdf(geom = "point") +
   ggplot2::xlab(var) + 
     ggplot2::ylab("cumulative prob")
  }
  )

view_multivar

Pairwise bivariate distributions and correlation plots

Note that: printing is not yet "addapted" to size of data and paper

#  warning=FALSE and message=FALSE 

ggplot2::theme_set(ggplot2::theme_bw())


df_red <- dplyr::select(df, logwage, totexper, eduCode, KYN, occup, ageM)
dnames1 <-names(DataExplorer::split_columns(df_red)$discrete)
cnames1 <- names(DataExplorer::split_columns(df_red)$continuous)


lapply(dnames1, FUN=function(varr) {
    GGally::ggpairs(
                  df_red
                 , ggplot2::aes( colour = df_red[[varr]] ),
                 cardinality_threshold = 50
                    )
                                  }
       )


## ------ most general correlation plots: for any type of var
# corrr::correlate(df) is an other option

# printing is not perfect
DataExplorer::plot_correlation(
      data=df_red, 
      type = "all",
          maxcat = 50L, cor_args = list(),
          title = NULL,
          ggtheme = theme_gray(),
          theme_config = list(legend.position ="bottom",
          axis.text.x = element_text(angle=90)))

view_outliers

Plots and boxplots,limits based on Tukey and Hampel methods

# qqplots of continuous variables ------------------
DataExplorer::plot_qq(df_red)

# boxplots by each discrete -----------------
#
lapply(dnames1, FUN=function(varr) {
  DataExplorer::plot_boxplot( df_red, by=varr , geom_boxplot_args = list("outlier.color"="red"))
  }
      )

# univar limits, using Tukey (interquartiles)
knitr::kable(
lapply(cnames1, FUN=function(x0) {
  c(
    x0,
    funModeling::tukey_outlier(as.data.frame(df_red)[[x0]])
   )
                                 }
      )
, format="markdown", col.names = " "
            )


# univar limits, using Hampel (median based )

knitr::kable(
  lapply(cnames1, FUN=function(x0) {

    c( x0, funModeling::hampel_outlier(df_red[[x0]]) )
}
)
, format="markdown", col.names = " "
)


# more tests for outliers 
#------ one score for each record, for each variable
# too long. To find a representation of this!

#lapply(cnames, FUN=function(x0){
#which(
#  outliers::scores(df[[x0]], type="t") == TRUE 
#   )
 ## other options
 ## outliers::scores(df[[x0]], type="z")

 ## outliers::scores(df[[x0]], type="iqr")
 #                             }
 #      )

#----- comparing many methods, using package "OutliersO3" ---------
# it works on continuous variables

# comparing two methods
# could add four more, at least: "BAC", "adjOut", "DDC, "MCD"
# but this is rather slow even with two

#O3m <- OutliersO3::O3prep(df[cnames], method=c("HDo", "PCS"))
#O3m1 <- OutliersO3::O3plotM(O3m)
#gridExtra::grid.arrange(O3m1$gO3, O3m1$gpcp, ncol=1)

view_assoc

With validation rules mining potential.Under development.


view_clusters

Potentialy identifying unwanted structures or confirming known ones. Under development.


rev_variability

information theory based measures, for categorical variables

#---------------------- categorical variables
# use funModeling and information_theory measures, all:
# entropy (en), mutual information (mi), information gain (ig), gain ratio (gr)


d_res <- c("var1","var2"," "," "," ", " ")
var1 <- character()
var2 <- character()
for (var1 in dnames1){
  for (var2 in dnames1)
    {
    if(var1 != var2) {
    d_res <- cbind(d_res, c(var1, var2,funModeling::infor_magic(input = df[[var1]],target =df[[var2]]))) }

    }
}

d_res

# continuous variables: using boot.ci seems to demand too much memory. To find a better solution

rev_ts

univariate and multivariate time series: detection of anomalous features, tests of stationarity and (auto/cross)-correlation

#mulitvar ---- with example series
#tsmultiv <- ts(matrix(rnorm(3000),ncol=100),freq=4)
#y <- anomalous::tsmeasures(tsmultiv)
#anomalous::biplot.features(y)
#anomalous::anomaly(y)
#tsfeatures::tsfeatures(tsmultiv)

#univar--- with example series
#tsuniv <- ts(rnorm(1000))
#tsfeatures::tsfeatures(tsuniv)
#tseries::jarque.bera.test(tsuniv)
#tseries::kpss.test(tsuniv)
#tseries::kpss.test(tsuniv, null = "Trend")
#tseries::adf.test(tsuniv)
#forecast::Acf(tsuniv)
#forecast::Pacf(tsuniv)

rev_model()

Model testing


check_assumptions()

Checking test or model assumptions aout data


reviewed

Reporting function




violetacln/revaliew documentation built on March 17, 2021, 6:02 p.m.