# Evaluate the `actual_date` entry of `params` at top level so its value is
# echoed. `params` is not defined in this file -- presumably the R Markdown
# parameter list (TODO confirm). The same entry is used further down to
# select the rows of `sample_actual`.
params$actual_date
library("opensignauxfaibles")
library("dplyr")
# Open the database handle with the package helper, then fetch the
# "wholesample" table through it. Every sample below is a subset of
# `table_wholesample`.
database_signauxfaibles <- database_connect()

table_wholesample <- collect_wholesample(
  table = "wholesample",
  db = database_signauxfaibles
)
# Build the three working samples. Each keeps the rows of one `periode`
# with `effectif >= 10`:
#   * sample_train  -- periode 2015-01-01, used to fit the models;
#   * sample_test   -- periode 2016-01-01, used to evaluate them;
#   * sample_actual -- periode given by `params$actual_date`, used for the
#     final predictions.
sample_train <- dplyr::filter(
  table_wholesample,
  periode == "2015-01-01",
  effectif >= 10
)

sample_test <- dplyr::filter(
  table_wholesample,
  periode == "2016-01-01",
  effectif >= 10
)

sample_actual <- dplyr::filter(
  table_wholesample,
  periode == params$actual_date,
  effectif >= 10
)

# Run the package's NA diagnostic on the full (unfiltered) table; the result
# is not stored, only displayed.
table_wholesample %>% detect_na()
# Candidate specifications for the `outcome_0_12` model, from the smallest
# (`effectif`) to the largest (`codenaf`). Each `rhs_*` vector holds the
# right-hand-side terms of one specification and is built on top of an
# earlier one, which makes the nesting between models explicit.
rhs_effectif <- "cut_effectif"

rhs_growth_effectif <- c(
  rhs_effectif,
  "cut_growthrate", "lag_effectif_missing"
)

rhs_apart <- c(
  rhs_growth_effectif,
  "apart_last12_months", "apart_consommee", "apart_share_heuresconsommees"
)

rhs_cotisation_effectif <- c(
  rhs_apart,
  "log_cotisationdue_effectif"
)

# NOTE(review): this specification uses the `_12m` debt variables, whereas
# the specifications that follow use the unsuffixed ones. Both branches
# extend `rhs_cotisation_effectif`. Kept exactly as in the original.
rhs_dettecumulee <- c(
  rhs_cotisation_effectif,
  "log_ratio_dettecumulee_cotisation_12m", "indicatrice_dettecumulee_12m"
)

rhs_croissancedettecumulee <- c(
  rhs_cotisation_effectif,
  "log_ratio_dettecumulee_cotisation", "indicatrice_dettecumulee",
  "indicatrice_croissance_dettecumulee"
)

rhs_nb_debits <- c(
  rhs_croissancedettecumulee,
  "nb_debits"
)

rhs_delais <- c(
  rhs_nb_debits,
  "delai", "delai_sup_6mois"
)

rhs_codenaf <- c(
  rhs_delais,
  "libelle_naf_niveau1"
)

# `reformulate()` is called directly at top level (not through `lapply()`)
# so that each formula captures the same environment a literal formula
# written here would.
formulas_0_12 <- list(
  effectif = stats::reformulate(
    rhs_effectif, response = "outcome_0_12"
  ),
  growth_effectif = stats::reformulate(
    rhs_growth_effectif, response = "outcome_0_12"
  ),
  apart = stats::reformulate(
    rhs_apart, response = "outcome_0_12"
  ),
  cotisation_effectif = stats::reformulate(
    rhs_cotisation_effectif, response = "outcome_0_12"
  ),
  dettecumulee = stats::reformulate(
    rhs_dettecumulee, response = "outcome_0_12"
  ),
  croissancedettecumulee = stats::reformulate(
    rhs_croissancedettecumulee, response = "outcome_0_12"
  ),
  nb_debits = stats::reformulate(
    rhs_nb_debits, response = "outcome_0_12"
  ),
  delais = stats::reformulate(
    rhs_delais, response = "outcome_0_12"
  ),
  codenaf = stats::reformulate(
    rhs_codenaf, response = "outcome_0_12"
  )
)
# Compare the candidate specifications out of sample. For each formula a
# binomial GLM is fitted on `sample_train`, response-scale predictions are
# computed for `sample_test`, and the AUC of the resulting ROC curve is
# returned as a tibble. `plyr::ldply()` combines the per-formula results into
# one data frame, which is displayed (not stored).
plyr::ldply(
  .data = formulas_0_12,
  .fun = function(model_formula) {
    fitted_model <- glm(
      formula = model_formula,
      family = "binomial",
      data = sample_train
    )
    scored_test <- broom::augment(
      fitted_model,
      newdata = sample_test,
      type.predict = "response"
    )
    roc_curve <- pROC::roc(
      outcome_0_12 ~ .fitted,
      data = scored_test,
      smooth = FALSE
    )
    tibble::as_tibble(roc_curve$auc)
  }
)
# Fit the `dettecumulee` specification as a binomial GLM on `sample_train`
# and score `sample_actual` with it. Predictions are on the response scale
# and the `.fitted` column produced by `broom::augment()` is renamed to
# `prediction_0_12`.
output_prediction_0_12 <- glm(
  formula = formulas_0_12$dettecumulee,
  family = "binomial",
  data = sample_train
) %>%
  broom::augment(newdata = sample_actual, type.predict = "response") %>%
  dplyr::rename(prediction_0_12 = .fitted)
# Rank the scored rows and export them as two CSV files.
#
# Rows whose `numero_compte` is returned by `compute_filter_proccollectives()`
# or `compute_filter_ccsv()` for `filter_date` are excluded with anti-joins,
# rows without a prediction are dropped, and the remainder is sorted by
# decreasing `prediction_0_12`.
#
# The ranking is built once and reused for both files, so the two
# database-backed filter helpers are called a single time instead of twice
# and both exports are guaranteed to come from the same table.
filter_date <- "2017-10-01"

# Make sure the target directory exists so the export cannot fail after the
# model has already been fitted. This is a no-op when it is already there.
dir.create("output", showWarnings = FALSE, recursive = TRUE)

ranked_predictions_0_12 <- output_prediction_0_12 %>%
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = filter_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = filter_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  # Dropping NA predictions before sorting yields the same row order as
  # sorting first, with fewer rows to sort.
  dplyr::filter(!is.na(prediction_0_12)) %>%
  dplyr::arrange(dplyr::desc(prediction_0_12))

# Top 100 rows, with the most relevant columns moved to the front and all
# remaining columns kept after them.
# The file name is passed positionally because readr renamed this argument
# from `path` to `file` in 1.4.0; the positional form works with both.
ranked_predictions_0_12 %>%
  dplyr::select(
    raison_sociale, siret,
    libelle_naf_niveau1,
    code_departement,
    prediction_0_12,
    effectif,
    apart_last12_months, apart_consommee, apart_share_heuresconsommees,
    nb_debits, delai, delai_sup_6mois, everything()
  ) %>%
  dplyr::slice(1:100) %>%
  readr::write_csv("output/table_predictions_2017_11_29.csv")

# Full ranking, with the original column order.
ranked_predictions_0_12 %>%
  readr::write_csv("output/table_predictions_2017_11_29_all.csv")


# SGMAP-AGD/opensignauxfaibles documentation built on May 15, 2019, 1:26 p.m.