R/time_series_pdf.R

#' @title Export ggplot with city time series to pdf file
#' @description The function helps to create pdf with ggplot for visual checking the quality of city data
#' @param dataset Dataframe with mandatory fields : year (integer), location (integer), indicator (c-code in character), industry (character) or okved (character)
#' @param folder  Path in string for saving pdf-document
#' @param filename   filename, like "all2",
#' @param begin  First year for charts plot
#' @param end    Last year for charts plot
#' @param lag    The numbers of intervals which splite time period between "begin" & "end" years to render on x-axix
#' @param oktmo_style Boolean, TRUE (by default) if title should include oktmo instead of city name. FALSE if city name on title preferable
#' @param scn_vec Vector with column's names needed to be displayed on chart. e.g. c ("fact", "dyn_calc')
#' @param kpi_vec Vector of C-codes in character like c("C019", "C475") or "All" in cases ploting all indicators on 1 per plot
#' @param ind_vec Vector with the code of industries & okved like c("1", "5") or "All" in cases ploting all industies in 1 pdf section."AllCity" by default if no industry column
#' @usage time_series_pdf(dataset, folder = getwd(), filename = NULL, begin = 2010, end = 2020, lag = 5, oktmo_style = FALSE, scn_vec = NULL, kpi_vec = "All", ind_vec = "All")
#' @return Pdf-file with ggplots in folder
#' @importFrom dplyr %>%
#' @importFrom DBI dbCanConnect
#' @importFrom RPostgres Postgres
#' @importFrom ggplot2 ggplot
#' @importFrom dplyr left_join
#' @importFrom dplyr distinct
#' @importFrom dplyr case_when
#' @importFrom ggplot2 geom_line
#' @importFrom ggplot2 geom_path
#' @importFrom ggplot2 scale_y_continuous
#' @importFrom ggplot2 scale_x_continuous
#' @importFrom ggplot2 geom_ribbon
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 labs
#' @importFrom ggplot2 theme
#' @importFrom ggforce facet_wrap_paginate
#' @importFrom ggplot2 ggproto
#' @importFrom ggforce n_pages
#' @export
#' @examples
#'
#' path <- 'https://www.dropbox.com/s/x0izu2v5b1jrd7d/sd_sample.csv?dl=1'
#' sd <- read.table(path, header = T, sep = ";")
#' time_series_pdf(dataset     = sd, folder = getwd(), begin = 2000, end = 2030, lag = 1, oktmo_style = F, kpi_vec = c ("C475", "C499"), ind_vec = "All")
time_series_pdf <- function (dataset,      # Пакет данных для вывода dataset <- sd
                              folder      = getwd(),
                              filename    = NULL,
                              begin       = 2010,
                              end         = 2020,
                              lag         = 5,
                              oktmo_style = FALSE,
                              scn_vec     = NULL,  # c ("fact", "dyn_calc')
                              kpi_vec     = "All", # kpi_vec <- c ("C475", "C499")
                              ind_vec     = "AllCity") {  # ind_vec <- c ("4", "6")  ind_vec <- "All" (не всего, но все отрасли)

  require(ggforce)
  {
    sprav <- read_all_prosha()$dwh_prod
    sprav$okved <- sprav$okved %>% as_tibble() %>% mutate (industry_geoveb = as.character(industry_geoveb),
                                                           industry_geoveb = ifelse(okved == "всего", "AllCity", industry_geoveb))
  } # 1. Считывание и обработка справочников  test <- sprav$okved
  {
    # dataset %>% slice_sample(n= 1) %>% paste0(collapse = "','")
    column_names <- names(dataset) #%>% paste0(collapse = ",")
    dataset1 <- dataset # Убрать и оставить только dataset - для уменьшения памяти
  } # 2. Забираем данные
  {# Обрабатываем все что связано с полем year
    if (!all(c("year", "location", "indicator") %in% column_names))   stop("No mandatory fields")
    if (begin > end || begin < min (distinct(dataset,year)) || end > max (distinct(dataset,year))) stop ("Wrong time frame")
    if (lag > (end-begin)) stop("To wide time lag")
    dataset1 <- dataset1 %>% filter (year >= begin, year <= end) # Уберем лишние годы
    # Обрабатываем все что связано с полем location
    dataset1 <- dataset1 %>% mutate(location = as.character(location))
    # Обрабатываем все что связано с полем indicator
    if (!"All" %in% kpi_vec & nrow(distinct(dataset,indicator) %>% filter(indicator %in% kpi_vec)) < length(kpi_vec) ) stop ("Kpi vector out of dataset's range")
    if (!all(kpi_vec %in% 'All')) dataset1 <- dataset1 %>% filter (indicator %in% kpi_vec) # Уберем лишние вектора
    # Обрабатываем все что связано с полем industry or okved
    if (any(column_names %in% "okved")) {
      dataset1 <- dataset1 %>% left_join(sprav$okved %>% select (okved,           ind_name = long_name           ) %>% distinct(okved          , ind_name), by = c("okved"    = "okved"))}
    else {
      if (any(column_names %in% "industry")) dataset1 <- dataset1 %>% left_join(sprav$okved %>% select (industry_geoveb, ind_name = industry_name_geoveb) %>% distinct(industry_geoveb, ind_name), by = c("industry" = "industry_geoveb"))
      else{
        dataset1 <- dataset1 %>% mutate (industry = "AllCity", ind_name = "AllCity")
      }}
    if (any(column_names %in% "okved")) dataset1 <- dataset1 %>% rename ("industry" = "okved") # для выгрузок в которых вместо индустрий находятся оквэды переходим на единое именование колонки industry
    # if ((!"All" %in% ind_vec & !"всего" %in% ind_vec & (any(column_names %in% "okved")) ) & nrow(distinct(dataset,okved) %>% filter(okved %in% ind_vec)) < length(ind_vec) ) stop ("OKVED vector out of dataset's range")
    # if ((!"All" %in% ind_vec & !"всего" %in% ind_vec & (any(column_names %in% "industry")) ) & nrow(distinct(dataset,industry) %>% filter(industry %in% ind_vec)) < length(ind_vec) ) stop ("Industry vector out of dataset's range")
    if (ind_vec == "AllCity" & any(column_names %in% "industry") )  dataset1 <- dataset1 %>% filter (industry == "AllCity")
    if (ind_vec == "AllCity" & any(column_names %in% "okved"   ) )  dataset1 <- dataset1 %>% filter (okved == "AllCity")
    if (ind_vec == "all"   & any(column_names %in% "industry") )  dataset1 <- dataset1 %>% filter (industry != "AllCity")
    if (ind_vec == "all"   & any(column_names %in% "okved"   ) )  dataset1 <- dataset1 %>% filter (okved != "AllCity")
    if (ind_vec != "AllCity" &  ind_vec != "All")  dataset1 <- dataset1 %>% filter (industry %in% ind_vec)

  } # 3. Проверяем содержимое датасета
  {
    dataset1 <- dataset1 %>%
      oktmo_restore("location") %>%
      left_join(sprav$indicator %>% as_tibble() %>% select (code , kpi_name = long_name, unit_id), by = c("indicator" = "code" )) %>%
      left_join(sprav$location  %>% as_tibble() %>% select (oktmo, loc_name = long_name         ), by = c("location"  = "oktmo")) %>%
      left_join(sprav$unit      %>% as_tibble() %>% select (id, unit = unique_name              ), by = c("unit_id"   = "id")) %>%
      mutate (ind_name   = ifelse ( any(column_names %in% "industry" || column_names %in% "okved"), "AllCity", str_remove(ind_name, "Деятельность ")),
              ind_name   = ifelse (industry == "AllCity", "AllCity", str_sub(ind_name, end = 20L)),
              # ind_name   = paste0(industry, ".", ind_name),
              kpi_name   = str_remove(kpi_name, "-ПОКАЗАТЕЛЬ ОТМЕНЕН- "),
              loc_name   = str_replace(loc_name, "агломерация", "аг."),
              loc_name   = str_replace(loc_name, "район", "р."),
              loc_name   = str_sub (loc_name, end = 20L),
              year       = as.numeric(year),
              ind_name   = as.factor (ind_name),
              kpi_name   = as.factor (kpi_name) #,city_title = as.factor(city_title)
      )
    if(oktmo_style) dataset1 <- dataset1 %>% mutate (city_title = as.factor(location)) else dataset1 <- dataset1 %>% mutate (city_title = as.factor(loc_name))

    dataset1 <- dataset1 %>% mutate (facet_ind  = paste0(ind_name, ": ", city_title))

  } # 4. Пристыковываем имена полей из справочников
  {
    ggplot_prep     <- function(dataset2, kpi_set,
                                begin       = 2010,
                                end         = 2020,
                                lag         = 5){ #dataset2 <- dataset1  kpi_set <-"C013" c ("C009", "C011")

      kpi_title <- dataset2 %>% filter(indicator %in% kpi_set) %>%
        distinct(indicator , kpi_name, unit) %>%
        mutate (n = paste0(indicator, ", ", kpi_name, ", ", unit)) %>% select(n) %>% deframe() %>% paste0(collapse = "\n")

      dataset2 <- dataset2 %>% filter(indicator %in% kpi_set)

      if (all(distinct(dataset2,industry) == "AllCity")){
        dataset2   <- dataset2 %>% mutate (facet_ind  = paste0(city_title))
        page_title <- kpi_title
        page_subtitle <- "По городу в целом"
      } else {
        dataset2   <- dataset2 %>% mutate (facet_ind  = paste0(ind_name, ": ", city_title))
        page_title <- "Показатели по отрасли в каждом городе"
        page_subtitle <- kpi_title} # выбор фасета в зависимости от датасета

      version <- ifelse(any(names(dataset) %in% "version"), distinct(dataset, version)[1], "undf")

      f <- ggplot(dataset2)
      {
        if ("value"      %in% names(dataset2))                                                {f <- f + geom_line  (aes(x = year, y    = value    , color = kpi_name)                    , size = .5)}

        if (("dyn"       %in% names(dataset2) & is.null(scn_vec)) || "dyn"      %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = dyn      , color = "Динамическая")              , size = .5)}
        if (("dyn_calc"  %in% names(dataset2) & is.null(scn_vec)) || "dyn_calc" %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = dyn_calc , color = "Динамическая")              , size = .5)}
        if (("dyn_use"   %in% names(dataset2) & is.null(scn_vec)) || "dyn_use"  %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = dyn_use  , color = "Дин. с фактами")            , size = .5)}

        if (("calc"      %in% names(dataset2) & is.null(scn_vec)) || "calc"     %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = calc     , color = "Регрессионный")             , size = .5)}
        if (("sk_calc"   %in% names(dataset2) & is.null(scn_vec)) || "sk_calc"  %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = sk_calc  , color = "Регрессионный каркас")      , size = .5, linetype = "dashed")}
        if (("sk_use"    %in% names(dataset2) & is.null(scn_vec)) || "sk_use"   %in% scn_vec) {f <- f + geom_line  (aes(x = year, y    = sk_use   , color = "Регр. каркас с фактами ")   , size = .5, linetype = "dashed")}

        if (("fact"      %in% names(dataset2) & is.null(scn_vec)) || "fact"     %in% scn_vec) {f <- f + geom_point (aes(x = year, y    = fact     , color = "Факты"       )              , size = 1 )}
        if (("d_fact"    %in% names(dataset2) & is.null(scn_vec)) || "d_fact"   %in% scn_vec) {f <- f + geom_point (aes(x = year, y    = fact     , color = "D-факты"     )              , size = 1 )}

        if (("low"       %in% names(dataset2) & is.null(scn_vec)) || "low"      %in% scn_vec) {f <- f + geom_ribbon(aes(x = year, ymin = low    , ymax = high     ), color = "grey70", alpha = .3)}
        if (("dyn_high"  %in% names(dataset2) & is.null(scn_vec)) || "dyn_high" %in% scn_vec) {f <- f + geom_ribbon(aes(x = year, ymin = dyn_low, ymax = dyn_high ), color = "grey70", alpha = .3)}

      } # Выбор сценария выводмого значения
      {f <- f +
          scale_y_continuous(labels = scales::comma) +
          scale_x_continuous(breaks = seq(from = begin, to = end, by = lag) ) +
          theme(plot.title      = element_text (face = "bold", size = 14, hjust = 0.5),
                plot.subtitle   = element_text (               size = 8 , hjust = 0.5),
                strip.text      = element_text(size = 8),
                axis.title.y    = element_blank(), axis.title.x    = element_blank(),
                legend.position = "bottom", legend.text = element_text (size = 8), legend.title = element_text (size = 8),
                axis.text.y     = element_text (size = 5), axis.text.x     = element_text (size = 5)) +
          labs(title    = page_title ,
               subtitle = page_subtitle,
               color    = "Легенда: ", x = "",
               caption  = paste0("Версия ", version ,", от ", Sys.time()))
      } # Оформление

      if(nrow(distinct(dataset2, location)) > 1){
        f <- f + ggforce:::facet_wrap_paginate(~ facet_ind, ncol = 6, nrow = 6, page = 1, scales = "free", labeller = labeller(facet_ind = label_wrap_gen(width = 30)))} else{
          f <- f + ggforce:::facet_wrap_paginate(~ facet_ind, ncol = 1, nrow = , page = 1, scales = "free", labeller = labeller(facet_ind = label_wrap_gen(width = 30)))}


      return(f)
    } # Функция подготовки основного графика
    ggplot_add_page <- function(dataset , kpi_set, begin, end, lag){
      f <- ggplot_prep(dataset, kpi_set, begin = begin, end = end, lag = lag)

      if(nrow(distinct(dataset %>% filter(indicator %in% kpi_set), location)) > 1){
        num <- ggforce:::n_pages(f)
        for(i in 1:num){
          print(f +
                  ggforce:::facet_wrap_paginate(~ facet_ind, ncol = 6, nrow = 6, page = i,
                                                scales = "free",
                                                labeller = labeller(facet_ind = label_wrap_gen(width = 30))) )

          cat(i, " page from ", num, " pages \r\n") }

      } else f


      # cat(paste0("Документ сохранен в папке ", folder, "\n под именем "))
    } # Функция печати на отдельную страницу


  } # 6. Функция создания ggplot для
  {
    ver <- ifelse(any(names(dataset1) %in% "version"), distinct(dataset1, version)[1], "undf")
    name_pdf <- case_when(
      ind_vec == "AllCity"  ~ paste0("All_", "v.", ver, "_", ".pdf"),
      TRUE                ~ paste0("Industry_",  paste0(ind_vec, collapse="_"), ".KPI_", paste0(kpi_vec, collapse="_"), "_v.", ver, "_", ".pdf"))

    if(!is.null(filename)) name_pdf <- paste0(filename, ".pdf")

    cairo_pdf(paste0(folder, "/",name_pdf), h = 210/25.4, w = 297/25.4, onefile = TRUE, fallback_resolution = 120)
    # pdf(name_pdf, height = 210/25.4, width = 297/25.4, onefile = TRUE, encoding="MacRoman")

    kpis <- unique(dataset1$indicator, 2) #%>% head(2) %>% print()
    jmax <- length(kpis)

    num_cities <- unique(dataset1$location, 2) %>% length() #%>% print()
    num_indus  <- unique(dataset1$industry, 2) %>% length() #%>% print()

    if(ind_vec == "AllCity" || jmax < 4){ # Когда нужен 1 показатель на график
      cat(paste0("Ориентировочная скорость вывода ", Sys.time() + num_cities*jmax*num_indus * 0.254386, " \r\n" ))
      for (j in 1:jmax){
        cat (paste0("Показатель: ", kpis[j], ". ", j , "-й из ", jmax, " показателей \r\n"))
        ggplot_add_page (dataset1, kpis[j], begin = begin, end = end, lag = lag)
      }} else { ggplot_add_page (dataset1, kpi_vec, begin = begin, end = end, lag = lag)} # Когда нужны несколько показателей на графике

    dev.off()

  } # 7. Печатаем страницы

}
St-Digital-Twin/Dtwin documentation built on Jan. 1, 2022, 8:11 p.m.