library("opensignauxfaibles")
library("broom")
library("dplyr")
database_signauxfaibles <- database_connect()
table_wholesample <- collect_wholesample(db = database_signauxfaibles, table = "wholesample")
sample_train <- table_wholesample %>%
  filter(periode == params$start)
sample_test <- table_wholesample %>%
  filter(periode == params$test)
sample_actual <- table_wholesample %>%
  filter(periode == params$actual)
sample_train %>%
  count(outcome_0_12) %>%
  mutate(share = 100 * n / sum(n))
formulas_0_12 <- list(
  "m0" = outcome_0_12 ~ cut_effectif,
  "m1" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing,
  "m2" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees,
  "m3" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif,
  "m4" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee,
  "m5" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee,
  "m6" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits,
  "m7" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois,
  "m8" = outcome_0_12 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois +
    libelle_naf_niveau1
)
glm(
  data = sample_train,
  formula = outcome_0_12 ~ cut_effectif,
  family = binomial
  ) %>%
  tidy()
glm(formula = outcome_0_12 ~ cut_effectif, family = "binomial", data = sample_train) %>%
  broom::augment(newdata = sample_test,  type.predict = "response") %>%
  pROC::roc(outcome_0_12 ~ .fitted, data = . , smooth = FALSE) %>%
  .$auc
glm(formula = formulas_0_12$m1,
    data = sample_train, family = binomial) %>%
  tidy()
lrtest(
  glm(formula = formulas_0_12$m0, family = "binomial", data = sample_train),
  glm(formula = formulas_0_12$m1, family = "binomial", data = sample_train)
  )
glm(formula = formulas_0_12$m1, family = "binomial", data = sample_train) %>%
  broom::augment(newdata = sample_test,  type.predict = "response") %>%
  pROC::roc(outcome_0_12 ~ .fitted, data = . , smooth = FALSE) %>%
  .$auc

Activité partielle

glm(formula = formulas_0_12$m2, family = "binomial", data = sample_train) %>%
  tidy()
compute_auc(f = formulas_0_12$m2, df_train = sample_train, df_test = sample_test)
lrtest(
  glm(formula = formulas_0_12$m1, family = "binomial", data = sample_train),
  glm(formula = formulas_0_12$m2, family = "binomial", data = sample_train)
  )
glm(formula = formulas_0_12$m3, family = "binomial", data = sample_train) %>%
  tidy()
compute_auc(f = formulas_0_12$m3, df_train = sample_train, df_test = sample_test)
lrtest(
  glm(formula = formulas_0_12$m2, family = "binomial", data = sample_train),
  glm(formula = formulas_0_12$m3, family = "binomial", data = sample_train)
  )
glm(formula = formulas_0_12$m4, family = "binomial", data = sample_train) %>%
  tidy()
lrtest(
  glm(formula = formulas_0_12$m3, family = "binomial", data = sample_train),
  glm(formula = formulas_0_12$m4, family = "binomial", data = sample_train)
  )

Tests du rapport de vraisemblance pour les différents modèles testés

plyr::ldply(
  .data = 2:9,
  .fun = function(x) {
    lrtest(
      glm(formula = formulas_0_12[[x-1]], family = "binomial", data = sample_train),
      glm(formula = formulas_0_12[[x]], family = "binomial", data = sample_train)
      ) %>%
      tidy() %>%
      filter(is.na(statistic) == FALSE) %>%
      select(statistic, p.value) %>%
      mutate(
        reject = (p.value <= .05),
        model_number = x)
    }
  )
plyr::ldply(.data = formulas_0_12, .fun = function(x) {compute_auc(f = x, df_train = sample_train, df_test = sample_test)})


SGMAP-AGD/opensignauxfaibles documentation built on May 15, 2019, 1:26 p.m.