R/process_matriz.R

validate_data_matriz <- function(data){

  names(data) <- tolower(names(data))

  stopifnot("rut" %in% names(data))
  stopifnot("mes_matriz" %in% names(data))
  stopifnot(ncol(data) == 2)

  data <- data %>%
    distinct(rut, mes_matriz) %>%
    mutate_all(as.numeric) %>%
    arrange(mes_matriz, rut) %>%
    mutate(id = row_number()) %>%
    select(everything(), id)

  data

}

get_matriz_aux <- function(drut, periodos) {

  max_p_1 <- max(periodos) + -1
  # x fecha observacion
  # y fecha sbif
  # > --- DIFF positivo --- > (periodo observacion) ----- DIFF negativo --->

  dmat <- left_join(
      add_year_month2(drut),
      tbl(drut$src$con, "matriz_modelos") %>% add_year_month2(),
      by = "rut") %>%
    mutate(DIFF = 12 * (YEAR.x - YEAR.y) + (MONTH.x - MONTH.y)) %>%
    select(ID, RUT, ends_with("x"), ends_with("y"), DIFF, everything()) %>%
    filter(DIFF >= 0, DIFF <= max_p_1)

  name <- "mat_aux"

  remove_if_exists(drut$src$con, name)

  qs <- db_sql_render(dmat$src$con, dmat) %>%
    as.character() %>%
    str_replace("FROM", sprintf("INTO %s FROM", name))

  # cat(qs)

  message("computing ", name)
  dbSendQuery(dmat$src$con, qs)

  dmat <- tbl(dmat$src$con, name)

  dmat

}

get_matriz_join_dlm <- function(dmat, periodos, tblname, calculate_min_mes_matriz = FALSE) {

  get_matrix_summarize_by_per_prod <- function(dmat, producto = list(1, 2, 20, 21, 22, 23, 24, 70), periodo = 6, lbl = "con") {

    message("PERIODO ", periodo, " PRODUCTO ", paste(producto, collapse = "-"))

    prods <- unlist(producto)

    daux <- dmat %>%
      filter(between(DIFF, 0, periodo - 1), producto_riesgo %in% prods) %>%  # count(DIFF) %>% collect() %>% View()
      select(id, operacion, DIFF, sdo = saldo, mra = mora_mes)

    d1 <- daux %>%
      select(id, mra, sdo) %>%
      group_by(id) %>%
      summarise_all(.funs = funs(mean = mean, max = max))

    d2 <- daux %>%
      group_by(id) %>%
      summarise(
        ant = max(DIFF) + 1,
        ops = n_distinct(operacion)
        )

    d3 <- daux %>% filter(mra > 30) %>% group_by(id) %>% summarise(mra_rec_30 = min(DIFF))
    d4 <- daux %>% filter(mra > 60) %>% group_by(id) %>% summarise(mra_rec_60 = min(DIFF))
    d5 <- daux %>% filter(mra > 90) %>% group_by(id) %>% summarise(mra_rec_90 = min(DIFF))
    d6 <- daux %>% filter(mra >120) %>% group_by(id) %>% summarise(mra_rec_120 = min(DIFF))

    daux <- list(d1, d2, d3, d4, d5, d6) %>% reduce(left_join, by = "id")

    vars <- daux$ops$args$vars$alias
    varsnew <- paste(vars, str_pad(lbl, 2, pad = "0"), str_pad(periodo, 2, pad = "0"), sep = "_")
    varsnew[1] <- vars[1]
    varsnew <- as.list(set_names(vars, varsnew))

    daux <- daux %>% select_(.dots = varsnew)

    daux

  }

  dinit <- tbl(dmat$src$con, tblname) %>%
    left_join(tbl(dmat$src$con, "score_antiguedad_cmr"), by = c("rut", "mes_matriz")) %>%
    select(-rut, -mes_matriz)
  # show_query(dinit)

  if(calculate_min_mes_matriz) {

    dinit <- tbl(dmat$src$con, tblname) %>%
      group_by(rut) %>%
      summarise(aux = 1) %>%
      select(-aux) %>%
      left_join(tbl(dmat$src$con, "matriz_modelos")) %>%
      group_by(rut) %>%
      summarise(mes_matriz_min = min(mes_matriz)) %>%
      left_join(tbl(dmat$src$con, tblname), .) %>%
      select(id, mes_matriz_min) %>%
      left_join(dinit, .)
    # show_query(dinit)

  }

  dprod <- data_frame(
    producto = list(
      con = list(1, 2, 20, 21, 22, 23, 24, 70),
      com = list(19, 26, 32, 41),
      ren = list(23),
      lin = list(2),
      aut = list(21),
      cns = list(24),
      # cru = list(41),
      ref = list(22),
      hip = list(60),
      cuo = list(20)
      ),
    lbl = names(producto)
  )

  ddlm <- expand.grid(
    lbl = pull(dprod, lbl),
    periodo = periodos
  ) %>%
    tbl_df() %>%
    left_join(dprod) %>%
    mutate(dmat = list(dmat)) %>%
    select(dmat, producto, periodo, lbl) %>%
    pmap(get_matrix_summarize_by_per_prod) %>%
    reduce(left_join, by = "id", .init = dinit)

  name <- "mat_join_dlm"

  remove_if_exists(dmat$src$con, name)

  qs <- db_sql_render(ddlm$src$con, ddlm) %>%
    as.character() %>%
    str_replace("FROM", sprintf("INTO %s FROM", name))

  str_length(qs)

  message("computing ", name)

  dbSendQuery(dmat$src$con, qs)

  ddlm <- tbl(dmat$src$con, name)

  ddlm


}

get_matriz_tend_by_pers <- function(ddlm, periodos = c(1, 6, 12), tblname) {

  vars <- ddlm$ops$vars  %>%
    str_subset("mean") %>%
    str_replace("_[0-9]+$", "") %>%
    unique()

  miniqs <- expand.grid(v = vars, p1 = periodos, p2 = periodos) %>%
    tbl_df() %>%
    filter(p1 < p2) %>%
    mutate_if(is.factor, as.character) %>%
    mutate_at(vars(p1, p2), str_pad, width = 2, pad = 0) %>%
    mutate(
      a = pmap_chr(., function(v, p1, p2) str_glue("case when {var}_{p2} > 0 then CAST({var}_{p1} AS float)/CAST({var}_{p2} AS float) else NULL end", var = v, p1 = p1, p2 = p2)),
      b = pmap_chr(., function(v, p1, p2) str_glue("{var}_tend_{p1}_{p2}", var = v, p1 = p1, p2 = p2)),
      miniq = map2_chr(a, b, function(a, b) str_glue("{a} as {b}", a = a, b = b))
    ) %>%
    pull(miniq) %>%
    str_c(collapse = ", ")

  # miniqs2 <- expand.grid(v = setdiff(vars, str_subset(vars, "MAX")), p = periodos) %>%
  #   tbl_df() %>%
  #   mutate_if(is.factor, as.character) %>%
  #   mutate(p = str_pad(p, width = 2, pad = 0), v = str_replace_all(v, "_MEAN", "")) %>%
  #   mutate(
  #     a = map2_chr(v, p, function(v, p) str_glue("case when {var}_MAX_{p} > 0 then CAST({var}_MIN_{p} as float)/CAST({var}_MAX_{p} AS float) else -9 end", var = v, p  = p)),
  #     b = map2_chr(v, p, function(v, p) str_glue("{var}_RATIO_MIN_MAX_{p}", var = v, p  = p)),
  #     miniq = map2_chr(a, b, function(a, b) str_glue("{a} as {b}", a = a, b = b))
  #   )%>%
  #   pull(miniq) %>%
  #   str_c(collapse = ", ")

  name <- str_c(tblname, "_variables")

  remove_if_exists(ddlm$src$con, name)

  qs <- str_glue(
    "SELECT B.mes_matriz, B.rut, A.*, { fields } FROM {tbl} A LEFT JOIN {tblori} B ON A.ID = B.ID",
    fields = miniqs,
    # fields2 = miniqs2,
    tbl = as.character(ddlm$ops$x),
    tblori = tblname
  ) %>%
    as.character() %>%
    str_replace("FROM", sprintf("INTO %s FROM", name))

  qs

  message("computing ", name)

  dbSendQuery(ddlm$src$con, qs)

  dvar <- tbl(ddlm$src$con, name)

  dvar

}

#' Function que calcula variables dado una tabla de rut periodo
#'
#' Function que calcula variables dado una tabla de rut periodo y una conexion
#' @param con una conexion odbc
#' @param data data frame con rut y mes_matriz
#' @param periodos vector numerico con periodos a calcular tendencias, resumentes, etc
#' @param collect collect
#' @param calculate_min_mes_matriz calculate_min_mes_matriz
#' @export
get_matriz_vars <- function(con, data, periodos = c(1, 3, 6, 12), collect = TRUE, calculate_min_mes_matriz = FALSE) {

  t0 <- Sys.time()

  message("Starting at: ", as.character(t0))

  dbSendQuery(con, "USE MATRIX")
  remove_if_exists(con, "mat_aux")
  remove_if_exists(con, "mat_join_dlm")
  # dbSendQuery(con, "DBCC SHRINKFILE(MATRIX_log, 1)")


  name <- create_name(data, prefix = "mat")

  data <- validate_data_matriz(data)
  drut <- upload_data(con, data, tblname = name)
  dmat <- get_matriz_aux(drut, periodos)
  ddlm <- get_matriz_join_dlm(dmat, periodos, tblname = name, calculate_min_mes_matriz = calculate_min_mes_matriz)
  dtnd <- get_matriz_tend_by_pers(ddlm, periodos, tblname = name)

  if(collect) {
    message("Collecting ", name)
    dtnd <- collect(dtnd)
  }

  message("Ending at: ", as.character(Sys.time()))
  print(difftime(Sys.time(), t0))

  dtnd

}
jbkunst/modflblla documentation built on June 21, 2019, 12:53 p.m.