README.md

eat

The EAT algorithm performs a regression tree based on CART methodology under a new approach that guarantees obtaining a frontier as estimator that fulfills the property of free disposability. This new technique has been baptized as Efficiency Analysis Trees. Some of its main functions are:

Installation

You can install the released version of eat from CRAN with:

install.packages("eat")

And the development version from GitHub with:

devtools::install_github("MiriamEsteve/EAT")

Example

library(eat)
data("PISAindex")
single_model <- EAT(data = PISAindex, 
                    x = 15, # input 
                    y = 3) # output
#> [conflicted] Will prefer dplyr::filter over any other package
print(single_model)
#>  [1] y: [ 551 ] || R: 11507.5 n(t): 72 
#>  
#>  |  [2] PFC < 77.2 --> y: [ 478 ] || R: 2324.47 n(t): 34 
#>  
#>  |   |  [4] PFC < 65.45 --> y: [ 428 ] <*> || R: 390.17 n(t): 16 
#>  
#>  |   |  [5] PFC >= 65.45 --> y: [ 478 ] <*> || R: 637.08 n(t): 18 
#>  
#>  |  [3] PFC >= 77.2 --> y: [ 551 ] <*> || R: 2452.83 n(t): 38 
#>  
#> <*> is a leaf node
summary(single_model)
#> 
#>   Formula:  S_PISA ~ PFC 
#> 
#>  # ========================== # 
#>  #   Summary for leaf nodes   # 
#>  # ========================== # 
#>  
#>  id n(t)  % S_PISA    R(t)
#>   3   38 53    551 2452.83
#>   4   16 22    428  390.17
#>   5   18 25    478  637.08
#> 
#>  # ========================== # 
#>  #            Tree            # 
#>  # ========================== # 
#>  
#>  Interior nodes: 2 
#>      Leaf nodes: 3 
#>     Total nodes: 5 
#>  
#>            R(T): 3480.08 
#>         numStop: 5 
#>            fold: 5 
#>       max.depth: 
#>      max.leaves:
#>  
#>  # ========================== # 
#>  # Primary & surrogate splits # 
#>  # ========================== # 
#>  
#>  Node 1 --> {2,3} || PFC --> {R: 4777.31, s: 77.2}
#> 
#>  Node 2 --> {4,5} || PFC --> {R: 1027.25, s: 65.45}
size(single_model)
#> The number of leaf nodes of the EAT model is: 3
frontier.levels(single_model)
#> The frontier levels of the outputs at the leaf nodes are:

#>   S_PISA
#> 1    551
#> 2    428
#> 3    478
descriptiveEAT <- descrEAT(single_model)

descriptiveEAT
#>   Node n(t)   %   mean     var    sd min     Q1 median     Q3 max   RMSE
#> 1    1   72 100 455.06 2334.59 48.32 336 416.75  466.0 495.25 551 107.27
#> 2    2   34  47 416.88 1223.02 34.97 336 397.25  415.5 435.75 478  70.16
#> 3    3   38  53 489.21  851.95 29.19 419 478.00  494.0 504.50 551  68.17
#> 4    4   16  22 394.62  684.65 26.17 336 381.50  398.0 414.00 428  41.90
#> 5    5   18  25 436.67  889.29 29.82 386 415.25  433.5 468.00 478  50.48
frontier(object = single_model,
         FDH = TRUE, 
         observed.data = TRUE,
         rwn = TRUE)
#> Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps

multioutput <- EAT(data = PISAindex, 
                   x = 6:18,
                   y = 3:5)
#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> Warning in preProcess(data = data, x = x, y = y, numStop = numStop, fold = fold, : Rows with NA values have been omitted .
rankingEAT(object = multioutput,
           barplot = TRUE,
           threshold = 70,
           digits = 2)
#> $scores
#>         Importance
#> AAE         100.00
#> WS           98.45
#> S            84.51
#> NBMC         83.37
#> HW           83.31
#> ABK          67.97
#> GDP_PPP      65.37
#> AIC          64.89
#> EQ           57.11
#> PR           57.05
#> I            57.05
#> PS           45.41
#> PFC          31.67
#> 
#> $barplot

plotEAT(object = multioutput)

n <- nrow(PISAindex) # Observations in the dataset
t_index <- sample(1:n, n * 0.7) # Training indexes
training <- PISAindex[t_index, ] # Training set
test <- PISAindex[-t_index, ] # Test set

bestEAT(training = training, 
        test = test,
        x = 6:18,
        y = 3:5,
        numStop = c(5, 7, 10),
        fold = c(5, 7))
#> Warning in preProcess(test, x, y, na.rm = na.rm): Rows with NA values have been omitted .

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#>   numStop fold  RMSE leaves
#> 1       7    5 66.94     10
#> 2       7    7 66.94     10
#> 3       5    7 71.87      8
#> 4       5    5 84.60      7
#> 5      10    5 85.06      5
#> 6      10    7 85.06      5
single_model <- EAT(data = PISAindex, 
                    x = 15, # input 
                    y = 3) # output
#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package
scores_EAT <-  efficiencyEAT(data = PISAindex,
                            x = 15, 
                            y = 3,
                            object = single_model, 
                            scores_model = "BCC.OUT",
                            digits = 3,
                            FDH = TRUE)
#>     EAT_BCC_OUT FDH_BCC_OUT
#> SGP       1.000       1.000
#> JPN       1.042       1.000
#> KOR       1.062       1.000
#> EST       1.040       1.000
#> NLD       1.095       1.095
#> POL       1.078       1.000
#> CHE       1.113       1.113
#> CAN       1.064       1.064
#> DNK       1.118       1.118
#> SVN       1.087       1.024
#> BEL       1.104       1.062
#> FIN       1.056       1.056
#> SWE       1.104       1.104
#> GBR       1.091       1.091
#> NOR       1.124       1.124
#> DEU       1.095       1.095
#> IRL       1.111       1.069
#> AUT       1.124       1.082
#> CZE       1.109       1.044
#> LVA       1.131       1.066
#> FRA       1.118       1.075
#> ISL       1.160       1.116
#> NZL       1.085       1.043
#> PRT       1.120       1.055
#> AUS       1.095       1.054
#> RUS       1.000       1.000
#> ITA       1.021       1.021
#> SVK       1.187       1.037
#> LUX       1.155       1.155
#> HUN       1.146       1.000
#> LTU       1.143       1.060
#> ESP       1.141       1.075
#> USA       1.098       1.056
#> BLR       1.015       1.015
#> MLT       1.193       1.106
#> HRV       1.006       1.006
#> ISR       1.193       1.106
#> TUR       1.021       1.000
#> UKR       1.019       1.000
#> CYP       1.255       1.182
#> GRC       1.058       1.058
#> SRB       1.086       1.000
#> MYS       1.091       1.068
#> ALB       1.026       1.000
#> BGR       1.127       1.127
#> ARE       1.270       1.177
#> MNE       1.152       1.128
#> ROU       1.122       1.122
#> KAZ       1.204       1.179
#> MDA       1.000       1.000
#> AZE       1.075       1.048
#> THA       1.005       1.005
#> URY       1.293       1.200
#> CHL       1.241       1.169
#> QAT       1.315       1.239
#> MEX       1.021       1.021
#> BIH       1.075       1.048
#> CRI       1.149       1.149
#> JOR       1.114       1.093
#> PER       1.059       1.032
#> GEO       1.117       1.089
#> MKD       1.036       1.036
#> LBN       1.115       1.115
#> COL       1.036       1.036
#> BRA       1.183       1.158
#> ARG       1.183       1.158
#> IDN       1.081       1.081
#> SAU       1.238       1.215
#> MAR       1.135       1.135
#> PAN       1.173       1.173
#> PHL       1.199       1.168
#> DOM       1.274       1.241
#> 
#>  Model  Mean Std. Dev. Min    Q1 Median    Q3   Max
#>    EAT 1.114     0.074   1 1.061  1.110 1.110 1.315
#>    FDH 1.081     0.065   1 1.030  1.069 1.069 1.241
scores_CEAT <- efficiencyCEAT(data = PISAindex,
                              x = 15, 
                              y = 3,
                              object = single_model, 
                              scores_model = "BCC.INP",
                              digits = 3,
                              DEA = TRUE)
#>     CEAT_BCC_INP DEA_BCC_INP
#> SGP        0.878       1.000
#> JPN        0.872       0.986
#> KOR        0.878       0.989
#> EST        0.857       0.969
#> NLD        0.736       0.824
#> POL        0.862       0.968
#> CHE        0.697       0.777
#> CAN        0.768       0.865
#> DNK        0.693       0.772
#> SVN        0.821       0.920
#> BEL        0.735       0.821
#> FIN        0.787       0.888
#> SWE        0.724       0.809
#> GBR        0.750       0.840
#> NOR        0.680       0.757
#> DEU        0.723       0.809
#> IRL        0.731       0.816
#> AUT        0.712       0.792
#> CZE        0.788       0.880
#> LVA        0.758       0.843
#> FRA        0.725       0.808
#> ISL        0.669       0.739
#> NZL        0.769       0.862
#> PRT        0.776       0.865
#> AUS        0.754       0.845
#> RUS        0.846       0.936
#> ITA        0.756       0.832
#> SVK        0.717       0.787
#> LUX        0.660       0.729
#> HUN        0.779       0.864
#> LTU        0.768       0.851
#> ESP        0.755       0.838
#> USA        0.756       0.846
#> BLR        0.750       0.826
#> MLT        0.703       0.771
#> HRV        0.792       0.875
#> ISR        0.703       0.771
#> TUR        0.866       0.953
#> UKR        0.831       0.916
#> CYP        0.628       0.678
#> GRC        0.754       0.822
#> SRB        0.767       0.829
#> MYS        0.734       0.792
#> ALB        1.000       1.000
#> BGR        0.661       0.691
#> ARE        0.616       0.663
#> MNE        0.698       0.698
#> ROU        0.662       0.701
#> KAZ        0.696       0.696
#> MDA        0.771       0.825
#> AZE        0.967       0.967
#> THA        0.744       0.787
#> URY        0.602       0.637
#> CHL        0.638       0.692
#> QAT        0.591       0.599
#> MEX        0.746       0.755
#> BIH        0.782       0.782
#> CRI        0.615       0.615
#> JOR        0.682       0.731
#> PER        0.795       0.795
#> GEO        0.798       0.798
#> MKD        0.768       0.768
#> LBN        0.735       0.735
#> COL        0.739       0.739
#> BRA        0.697       0.697
#> ARG        0.693       0.693
#> IDN        0.735       0.735
#> SAU        0.674       0.674
#> MAR        0.748       0.748
#> PAN        0.770       0.770
#> PHL        0.780       0.780
#> DOM        0.804       0.804
#> 
#>  Model  Mean Std. Dev.   Min    Q1 Median    Q3 Max
#>   CEAT 0.749     0.077 0.591 0.698  0.749 0.749   1
#>    DEA 0.805     0.094 0.599 0.739  0.801 0.801   1
efficiencyJitter(object = single_model,
                 df_scores = scores_EAT$EAT_BCC_OUT,
                 scores_model = "BCC.OUT",
                 lwb = 1.2)

efficiencyDensity(df_scores = scores_EAT[, 3:4],
                  model = c("EAT", "FDH"))

forest <- RFEAT(data = PISAindex, 
                x = 6:18, # input 
                y = 5, # output
                numStop = 5, 
                m = 30,
                s_mtry = "BRM",
                na.rm = TRUE)
#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package
print(forest)
#> 
#>   Formula:  M_PISA ~ NBMC + WS + S + PS + ABK + AIC + HW + EQ + PR + PFC + I + AAE + GDP_PPP 
#> 
#>  # ========================== # 
#>  #           Forest           # 
#>  # ========================== # 
#>  
#>  Error: 738.42
#>  numStop: 5
#>  No. of trees (m): 30
#>  No. of inputs tried (s_mtry): BRM
plotRFEAT(forest)

rankingRFEAT(object = forest, barplot = TRUE,
             digits = 2)
#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> $scores
#>         Importance
#> PS           14.25
#> PR           14.08
#> AAE          13.97
#> EQ           11.86
#> S            10.80
#> HW            9.36
#> AIC           6.36
#> I             4.49
#> NBMC          3.26
#> WS           -1.79
#> GDP_PPP      -4.68
#> PFC          -4.77
#> ABK          -6.11
#> 
#> $barplot

bestRFEAT(training = training, 
          test = test,
          x = 6:18,
          y = 3:5,
          numStop = c(5, 10),
          m = c(30, 40),
          s_mtry = c("BRM", "3"))
#> Warning in preProcess(test, x, y, na.rm = na.rm): Rows with NA values have been omitted .

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#>   numStop  m s_mtry  RMSE
#> 1       5 40      3 57.44
#> 2       5 40    BRM 57.72
#> 3       5 30    BRM 58.39
#> 4       5 30      3 59.13
#> 5      10 30    BRM 62.43
#> 6      10 40    BRM 63.18
#> 7      10 40      3 65.02
#> 8      10 30      3 68.43
efficiencyRFEAT(data = PISAindex,
                x = 6:18, # input
                y = 5, # output
                object = forest,
                FDH = TRUE)
#>     RFEAT_BCC_OUT FDH_BCC_OUT
#> SGP         0.936       1.000
#> JPN         1.024       1.000
#> KOR         1.004       1.000
#> EST         0.982       1.000
#> NLD         0.999       1.000
#> POL         0.982       1.000
#> CHE         1.026       1.002
#> CAN         1.010       1.008
#> DNK         1.017       1.014
#> SVN         1.004       1.000
#> BEL         1.006       1.000
#> FIN         1.021       1.018
#> SWE         1.029       1.028
#> GBR         1.019       1.000
#> NOR         1.039       1.030
#> DEU         1.028       1.032
#> IRL         1.031       1.032
#> AUT         1.024       1.034
#> CZE         1.014       1.000
#> LVA         0.997       1.000
#> FRA         1.037       1.000
#> ISL         1.070       1.042
#> NZL         1.048       1.045
#> PRT         0.995       1.000
#> AUS         1.054       1.051
#> RUS         0.963       1.000
#> ITA         1.011       1.000
#> SVK         0.997       1.000
#> LUX         1.053       1.000
#> HUN         1.008       1.000
#> LTU         1.020       1.000
#> ESP         1.030       1.000
#> USA         1.033       1.000
#> BLR         0.992       1.000
#> MLT         1.026       1.000
#> HRV         1.034       1.000
#> ISR         1.050       1.000
#> TUR         0.979       1.000
#> UKR         0.981       1.000
#> CYP         1.095       1.000
#> GRC         1.063       1.007
#> SRB         1.002       1.000
#> MYS         0.998       1.000
#> ALB         0.982       1.000
#> BGR         1.031       1.000
#> ARE         1.014       1.000
#> MNE         1.022       1.000
#> ROU         1.031       1.000
#> KAZ         1.018       1.000
#> MDA         1.003       1.000
#> AZE         0.972       1.000
#> THA         0.991       1.000
#> URY         1.045       1.000
#> CHL         1.097       1.005
#> QAT         1.070       1.000
#> MEX         1.005       1.000
#> BIH         1.042       1.000
#> CRI         1.085       1.000
#> JOR         1.046       1.000
#> PER         0.997       1.000
#> GEO         1.060       1.000
#> MKD         1.052       1.000
#> LBN         1.037       1.000
#> COL         1.055       1.000
#> BRA         1.074       1.000
#> ARG         1.157       1.000
#> IDN         1.034       1.000
#> SAU         1.100       1.000
#> MAR         1.031       1.000
#> PAN         1.116       1.000
#> PHL         1.049       1.000
#> DOM         1.145       1.000
#> 
#>  Model  Mean Std. Dev.   Min    Q1 Median    Q3   Max
#>  RFEAT 1.029     0.039 0.936 1.003  1.026 1.026 1.157
#>    FDH 1.005     0.012 1.000 1.000  1.000 1.000 1.051
input <- c(6, 7, 8, 12, 17)
output <- 3:5

EAT_model <- EAT(data = PISAindex, x = input, y = output)
#> [conflicted] Removing existing preference

#> [conflicted] Will prefer dplyr::filter over any other package

#> Warning in preProcess(data = data, x = x, y = y, numStop = numStop, fold = fold, : Rows with NA values have been omitted .
RFEAT_model <- RFEAT(data = PISAindex, x = input, y = output)
#> [conflicted] Removing existing preference
#> [conflicted] Will prefer dplyr::filter over any other package

#> Warning in preProcess(data = data, x = x, y = y, numStop = numStop, na.rm = na.rm): Rows with NA values have been omitted .
# PREDICTIONS
predictions_EAT <- predict(object = EAT_model, newdata = PISAindex[, input])
predictions_RFEAT <- predict(object = RFEAT_model, newdata = PISAindex[, input])

Please, check the vignette for more details.



Try the eat package in your browser

Any scripts or data that you put into this service are public.

eat documentation built on Jan. 10, 2023, 5:13 p.m.