#' @importFrom dplyr rename_all
clean_df <- function(d) {
d <- mutate_if(d, function(x){ "integer64" %in% class(x) }, as.numeric)
d <- rename_all(d, tolower)
d
}
#'
#' Obtener muestra desde poblacion
#'
#' @param dpob dpob
#' @param nmuestra nmuestra
#' @param ponderado ponderado
#' @param seed seed
#' @importFrom utils menu
#' @importFrom stats qnorm
#' @importFrom dplyr anti_join summarise_if vars bind_rows
#' @importFrom tidyr gather spread
#' @examples
#'
#' \dontrun{
#' library(modflblla)
#' library(tidyverse)
#' con <- DBI::dbConnect(odbc::odbc(), driver = "Microsoft SQL Server", dsn = "ve_paris")
#' DBI::dbSendQuery(con, "USE MATRIX")
#'
#' dpob <- tbl(con, "DATA_CMR_BCO_RENEG")
#' get_sample_bcocmr(dpob)
#' get_sample_bcocmr(dpob, 300000)
#'
#' dpob <- readRDS("D:/Docs/modelo-behavior-fusion/data/20/01_doper2.rds")
#' dpob <- dpob %>% select(rut, operacion, mes_matriz, marca_inc_12m, comp_n)
#'
#' }
#' @export
get_sample_bcocmr <- function(dpob, nmuestra = 118626, ponderado = TRUE, seed = 1234) {
message("Obteniendo resumen de la poblacion (tasas de malos, ponderados, proporcion por mes)")
# ANOMES total bad good good_w
# <S3: integer64> <int> <dbl> <dbl> <dbl>
# 1 201003 165644 63063 102581 91227.
# 2 201004 136702 53269 83433 73952.
# 3 201005 147270 54208 93062 82700.
# 4 201006 129169 49131 80038 69329.
# 5 201007 144961 50456 94505 82014.
# 6 201008 145079 50365 94714 82798.
# 7 201009 125701 47793 77908 69162.
# 8 201010 139632 50939 88693 79094.
# 9 201011 122905 48076 74829 65959.
# 10 201012 126686 49899 76787 67436.
#
# dpob <- tbl(con, "data_cmr_bco_mora") %>%
# filter(ANOMES >= 201003) %>%
# mutate(
# marca_inc_12m = ifelse(DEF == 1, 1, 0),
# comp_n = ifelse(marca_inc_12m == 1, 12, 12 * INCUMPLIMIENTO)
# ) %>%
# rename(
# rut = RUT, mes_matriz = ANOMES
# ) %>%
# select(rut, mes_matriz, marca_inc_12m, comp_n)
# dpob
if(!ponderado) {
dpob <- dpob %>%
mutate(comp_n = 12)
}
dpob_res <- dpob %>%
mutate(pond = ifelse(marca_inc_12m == 1, 1, comp_n/12)) %>%
group_by(mes_matriz) %>%
summarise(
total = n(),
malos = sum(marca_inc_12m),
buenos = sum(1 - marca_inc_12m),
buenos_pond = sum((1 - marca_inc_12m) * pond)
) %>%
mutate(
total_pond = buenos_pond + malos,
br = malos/total,
br_pond = malos/total_pond
) %>%
collect() %>%
mutate(
prop = total/sum(total),
prop_pond = total_pond/sum(total_pond)
) %>%
modflblla:::clean_df()
dpob_res <- dpob_res %>%
mutate(
muestra_n_bueno = round(prop_pond * nmuestra * (1 - br_pond)),
muestra_n_malo = round(prop_pond * nmuestra * br_pond)
)
# no siempre da justo
diff <- dpob_res %>%
select(muestra_n_bueno, muestra_n_malo) %>%
summarise_if(is.numeric, sum) %>%
mutate(
muestra_n2 = muestra_n_bueno + muestra_n_malo,
diff = nmuestra - muestra_n2
)
diff
diff <- diff %>% pull(diff)
fix <- c(
rep(sign(diff), abs(diff)),
rep(0, 2 * nrow(dpob_res) - abs(diff))
)
set.seed(seed)
fix <- sample(fix)
dpob_res <- dpob_res %>%
mutate(
muestra_n_bueno = muestra_n_bueno + fix[1:nrow(dpob_res)],
muestra_n_malo = muestra_n_malo + fix[(nrow(dpob_res) + 1):(2*nrow(dpob_res))]
)
diff <- dpob_res %>%
select(muestra_n_bueno, muestra_n_malo) %>%
summarise_if(is.numeric, sum) %>%
mutate(
muestra_n2 = muestra_n_bueno + muestra_n_malo,
diff = nmuestra - muestra_n2
)
dsample <- structure(
list(
rut = numeric(0),
operacion = numeric(0),
mes_matriz = numeric(0),
marca_inc_12m = numeric(0),
comp_n = numeric(0)
),
row.names = c(NA, 0L),
class = c("tbl_df", "tbl", "data.frame")
)
message("Obteniendo resumen de clientes (apariciones -> pesos)")
dcli <- dpob %>%
group_by(rut) %>%
summarise(n = n()) %>%
collect() %>%
mutate(weight = max(n) - n + 1) %>%
modflblla:::clean_df()
dcli
meses <- dpob_res %>%
distinct(mes_matriz) %>%
arrange(mes_matriz) %>%
pull(mes_matriz)
message("Ingresando a ciclo")
for(m in rev(meses)) {
message(rep("*", 10))
message("mes: ", m)
# datos a obtener del mes
dpob_mes <- filter(dpob_res, mes_matriz == m)
# poblacion en el mes
dmes <- dpob %>%
filter(mes_matriz == m) %>%
collect() %>%
modflblla:::clean_df()
# dejamos una operacion por rut al azar
set.seed(seed)
dmes <- dmes %>%
sample_frac(1) %>%
distinct(rut, .keep_all = TRUE)
# se eliminan los ruts que ya se seleccionaron previamente
# y se pegan los pesos
dmes <- dmes %>%
anti_join(dsample, by = "rut") %>%
left_join(dcli, by = "rut")
# muestra malos
set.seed(seed)
dmes_malos <- dmes %>%
filter(marca_inc_12m == 1) %>%
sample_n(size = pull(dpob_mes, muestra_n_malo), weight = weight)
# muestra buenos
set.seed(seed)
dmes_buenos <- dmes %>%
filter(marca_inc_12m == 0) %>%
sample_n(size = pull(dpob_mes, muestra_n_bueno), weight = weight)
dmes <- bind_rows(dmes_malos, dmes_buenos)
dmes <- select(dmes, -n, -weight)
dsample <- bind_rows(dsample, dmes)
}
dpob_res <- dsample %>%
group_by(mes_matriz) %>%
summarise(br_muestra = mean(marca_inc_12m), prop_sample = n()) %>%
mutate(prop_sample = prop_sample/sum(prop_sample)) %>%
left_join(dpob_res, ., by = "mes_matriz")
p <- dpob_res %>%
gather(key, value, -mes_matriz) %>%
mutate(mes_matriz = lubridate::ymd(paste0(mes_matriz, "01"))) %>%
ggplot() +
geom_line(aes(mes_matriz, value, color = key)) +
scale_color_viridis_d() +
scale_y_continuous(limits = c(0, NA)) +
facet_wrap(vars(key), scales = "free_y") +
theme_minimal() +
theme(legend.position = "none")
print(p)
dsample
}
#' Obtener tamaio muestral
#' @param N N
#' @param p p
#' @param D D
#' @param alpha alpha
#' @export
get_N_sample <- function(N = 100000, p = 0.1, D = .25/100, alpha = 1/100) {
Z <- qnorm(1 - alpha/2)
message("\tN: ", scales::comma(N))
message("\tp: ", scales::percent(p))
message("\talpha: ", scales::percent(alpha))
message("\tD: ", scales::percent(D))
message("\tZ: ", round(Z, 3))
n <- (N * Z * Z * p * (1 - p)) /
(D * D * (N - 1) + Z * Z * p * (1 - p))
n <- round(n)
message("\tn: ", scales::comma(n))
n
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.