library("opensignauxfaibles")
library("dplyr")
database_signauxfaibles <- database_connect()
table_wholesample <- collect_wholesample(db = database_signauxfaibles, table = "wholesample")
sample_train <- table_wholesample %>%
  filter(periode == "2014-01-01")
sample_test <- table_wholesample %>%
  filter(periode == "2015-01-01")
sample_actual <- table_wholesample %>%
  filter(periode == params$actual_date)
table_wholesample %>% 
  detect_na()
formulas_6_18 <- list(
  "effectif" = outcome_6_18 ~ cut_effectif,
  "growth_effectif" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing,
  "apart" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees,
  "cotisation_effectif" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif,
  "dettecumulee" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee,
  "croissancedettecumulee" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee,
  "nb_debits" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits,
  "delais" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois,
  "codenaf" = outcome_6_18 ~ cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois +
    libelle_naf_niveau1
)
plyr::ldply(
  .data = formulas_6_18,
  .fun = function(x) {
    glm(formula = x, family = "binomial", data = sample_train) %>%
    broom::augment(newdata = sample_test,  type.predict = "response") %>%
    pROC::roc(outcome_6_18 ~ .fitted, data = . , smooth = FALSE) %>%
    .$auc %>%
    tibble::as_tibble()
    }
  )  
output_prediction_6_18 <- formulas_6_18$dettecumulee %>%
  glm(
    formula = .,
    data = sample_train,
    family = "binomial"
    ) %>%
  broom::augment(
    newdata = sample_actual,
    type.predict = "response"
  ) %>%
  dplyr::rename(prediction_6_18 = .fitted) 
output_prediction_6_18 %>%
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = "2017-08-01"),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = "2017-08-01"),
    by = c("numero_compte"),
    copy = TRUE
  ) %>%
  dplyr::arrange(
    dplyr::desc(
      prediction_6_18
    )
  ) %>%
  dplyr::filter(is.na(prediction_6_18) == FALSE) %>%
  dplyr::select(raison_sociale, siren, siret,
         libelle_naf_niveau1, code_departement, siege,
         prediction_6_18,
         effectif,
         apart_last12_months, apart_consommee, apart_share_heuresconsommees,
         nb_debits, delai, delai_sup_6mois
  ) %>%
  dplyr::slice(1:100) %>%
  readr::write_csv(path = "data/table_predictions_6_18_2017_09_18.csv")
output_prediction_6_18 %>%
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = "2017-08-01"),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = "2017-08-01"),
    by = c("numero_compte"),
    copy = TRUE
  ) %>%
  dplyr::arrange(
    dplyr::desc(
      prediction_6_18
    )
  ) %>%
  dplyr::filter(is.na(prediction_6_18) == FALSE) %>%
  readr::write_csv(path = "data/table_predictions_6_18_2017_09_18_all.csv")


SGMAP-AGD/opensignauxfaibles documentation built on May 15, 2019, 1:26 p.m.