R/get_sample_bcocmr.R

#' @importFrom dplyr rename_all
clean_df <- function(d) {

  d <- mutate_if(d, function(x){ "integer64" %in% class(x) }, as.numeric)
  d <- rename_all(d, tolower)
  d

}

#'
#' Obtener muestra desde poblacion
#'
#' @param dpob dpob
#' @param nmuestra nmuestra
#' @param ponderado ponderado
#' @param seed seed
#' @importFrom utils menu
#' @importFrom stats qnorm
#' @importFrom dplyr anti_join summarise_if vars bind_rows
#' @importFrom tidyr gather spread
#' @examples
#'
#' \dontrun{
#' library(modflblla)
#' library(tidyverse)
#' con <- DBI::dbConnect(odbc::odbc(), driver = "Microsoft SQL Server", dsn = "ve_paris")
#' DBI::dbSendQuery(con, "USE MATRIX")
#'
#' dpob <- tbl(con, "DATA_CMR_BCO_RENEG")
#' get_sample_bcocmr(dpob)
#' get_sample_bcocmr(dpob, 300000)
#'
#' dpob <- readRDS("D:/Docs/modelo-behavior-fusion/data/20/01_doper2.rds")
#' dpob <- dpob %>% select(rut, operacion, mes_matriz, marca_inc_12m, comp_n)
#'
#' }
#' @export
get_sample_bcocmr <- function(dpob, nmuestra = 118626, ponderado = TRUE, seed = 1234) {

  message("Obteniendo resumen de la poblacion (tasas de malos, ponderados, proporcion por mes)")

  # ANOMES           total   bad   good good_w
  # <S3: integer64>  <int> <dbl>  <dbl>  <dbl>
  # 1 201003          165644 63063 102581 91227.
  # 2 201004          136702 53269  83433 73952.
  # 3 201005          147270 54208  93062 82700.
  # 4 201006          129169 49131  80038 69329.
  # 5 201007          144961 50456  94505 82014.
  # 6 201008          145079 50365  94714 82798.
  # 7 201009          125701 47793  77908 69162.
  # 8 201010          139632 50939  88693 79094.
  # 9 201011          122905 48076  74829 65959.
  # 10 201012          126686 49899  76787 67436.
  #
  # dpob <- tbl(con, "data_cmr_bco_mora") %>%
  #   filter(ANOMES >= 201003) %>%
  #   mutate(
  #     marca_inc_12m = ifelse(DEF == 1, 1, 0),
  #     comp_n = ifelse(marca_inc_12m == 1, 12, 12 * INCUMPLIMIENTO)
  #     ) %>%
  #   rename(
  #     rut = RUT, mes_matriz = ANOMES
  #   ) %>%
  #   select(rut, mes_matriz, marca_inc_12m, comp_n)
  # dpob

  if(!ponderado) {

    dpob <- dpob %>%
      mutate(comp_n = 12)

  }

  dpob_res <- dpob %>%
    mutate(pond = ifelse(marca_inc_12m == 1, 1, comp_n/12)) %>%
    group_by(mes_matriz) %>%
    summarise(
      total = n(),
      malos = sum(marca_inc_12m),
      buenos = sum(1 - marca_inc_12m),
      buenos_pond = sum((1  - marca_inc_12m) * pond)
    ) %>%
    mutate(
      total_pond = buenos_pond + malos,
      br = malos/total,
      br_pond = malos/total_pond
    ) %>%
    collect() %>%
    mutate(
      prop = total/sum(total),
      prop_pond = total_pond/sum(total_pond)
    ) %>%
    modflblla:::clean_df()

  dpob_res <- dpob_res %>%
    mutate(
      muestra_n_bueno = round(prop_pond * nmuestra * (1 - br_pond)),
      muestra_n_malo = round(prop_pond * nmuestra * br_pond)
    )

  # no siempre da justo
  diff <- dpob_res %>%
    select(muestra_n_bueno, muestra_n_malo) %>%
    summarise_if(is.numeric, sum) %>%
    mutate(
      muestra_n2 = muestra_n_bueno + muestra_n_malo,
      diff = nmuestra - muestra_n2
    )

  diff
  diff <- diff %>% pull(diff)

  fix <- c(
    rep(sign(diff), abs(diff)),
    rep(0, 2 * nrow(dpob_res) - abs(diff))
  )

  set.seed(seed)

  fix <- sample(fix)

  dpob_res <- dpob_res %>%
    mutate(
      muestra_n_bueno = muestra_n_bueno + fix[1:nrow(dpob_res)],
      muestra_n_malo = muestra_n_malo + fix[(nrow(dpob_res) + 1):(2*nrow(dpob_res))]
    )

  diff <- dpob_res %>%
    select(muestra_n_bueno, muestra_n_malo) %>%
    summarise_if(is.numeric, sum) %>%
    mutate(
      muestra_n2 = muestra_n_bueno + muestra_n_malo,
      diff = nmuestra - muestra_n2
    )

  dsample <- structure(
    list(
      rut = numeric(0),
      operacion = numeric(0),
      mes_matriz = numeric(0),
      marca_inc_12m = numeric(0),
      comp_n = numeric(0)
    ),
    row.names = c(NA, 0L),
    class = c("tbl_df", "tbl", "data.frame")
  )

  message("Obteniendo resumen de clientes (apariciones -> pesos)")

  dcli <- dpob %>%
    group_by(rut) %>%
    summarise(n = n()) %>%
    collect() %>%
    mutate(weight = max(n) - n + 1) %>%
    modflblla:::clean_df()

  dcli

  meses <- dpob_res %>%
    distinct(mes_matriz) %>%
    arrange(mes_matriz) %>%
    pull(mes_matriz)

  message("Ingresando a ciclo")

  for(m in rev(meses)) {

    message(rep("*", 10))
    message("mes: ", m)

    # datos a obtener del mes
    dpob_mes <- filter(dpob_res, mes_matriz == m)

    # poblacion en el mes
    dmes <- dpob %>%
      filter(mes_matriz == m) %>%
      collect() %>%
      modflblla:::clean_df()

    # dejamos una operacion por rut al azar
    set.seed(seed)
    dmes <- dmes %>%
      sample_frac(1) %>%
      distinct(rut, .keep_all = TRUE)

    # se eliminan los ruts que ya se seleccionaron previamente
    # y se pegan los pesos
    dmes <- dmes %>%
      anti_join(dsample, by = "rut") %>%
      left_join(dcli, by = "rut")

    # muestra malos
    set.seed(seed)
    dmes_malos <- dmes %>%
      filter(marca_inc_12m == 1) %>%
      sample_n(size = pull(dpob_mes, muestra_n_malo), weight = weight)

    # muestra buenos
    set.seed(seed)
    dmes_buenos <- dmes %>%
      filter(marca_inc_12m == 0) %>%
      sample_n(size = pull(dpob_mes, muestra_n_bueno), weight = weight)

    dmes <- bind_rows(dmes_malos, dmes_buenos)
    dmes <- select(dmes, -n, -weight)

    dsample <- bind_rows(dsample, dmes)

  }

  dpob_res <- dsample %>%
    group_by(mes_matriz) %>%
    summarise(br_muestra = mean(marca_inc_12m), prop_sample = n()) %>%
    mutate(prop_sample = prop_sample/sum(prop_sample)) %>%
    left_join(dpob_res, ., by = "mes_matriz")

  p <- dpob_res %>%
    gather(key, value, -mes_matriz) %>%
    mutate(mes_matriz = lubridate::ymd(paste0(mes_matriz, "01"))) %>%
    ggplot() +
    geom_line(aes(mes_matriz, value, color = key)) +
    scale_color_viridis_d() +
    scale_y_continuous(limits = c(0, NA)) +
    facet_wrap(vars(key), scales = "free_y") +
    theme_minimal() +
    theme(legend.position = "none")

  print(p)

  dsample

}


#' Obtener tamaio muestral
#' @param N N
#' @param p p
#' @param D D
#' @param alpha alpha
#' @export
get_N_sample <- function(N = 100000, p = 0.1, D = .25/100, alpha = 1/100) {

  Z <- qnorm(1 - alpha/2)

  message("\tN: ", scales::comma(N))
  message("\tp: ", scales::percent(p))
  message("\talpha: ", scales::percent(alpha))
  message("\tD: ", scales::percent(D))
  message("\tZ: ", round(Z, 3))

  n <- (N * Z * Z * p * (1 - p)) /
    (D * D * (N - 1) + Z * Z * p * (1 - p))

  n <- round(n)

  message("\tn: ", scales::comma(n))

  n

}
jbkunst/modflblla documentation built on June 21, 2019, 12:53 p.m.