R/desc_date.R

Defines functions plot_freq_date_group plot_freq_date plot_missing_date audit_vector_date

Documented in audit_vector_date plot_freq_date plot_freq_date_group plot_missing_date

#'
#' This function plots the total number of records by group. Group with no records are
#' plotted with a red point and a
#' line indicating the expected number of records by hour is also added to the graph.
#'
#' @param x a date vector
#' @param group the type of grouping, one of hour, weekday, month and year.
#' @param title logical, whether to add a title to the graph.
#'
#' @return the plot generated
#'
#' @import data.table
#' @import ggplot2
#' @importFrom stringi stri_trans_totitle
#'
plot_freq_date_group <- function(x, group, title = TRUE) {
  # # check argument
  # if (!is_date(x)) {
  #   stop('"x" must be a date.')
  # }
  group <- match.arg(group, choices = c("hour", "weekday", "month", "year"))
  date_pattern <- c("hour" = "%H", "weekday" = "%A", "month" = "%b", "year" = "%Y")
  min_year <- as.numeric(format(min(x), "%Y"))
  max_year <- as.numeric(format(max(x), "%Y"))
  factor_levels <- list(
    "hour" = c(paste0("0", 0:9), as.character(10:23)),
    "weekday" = format(seq(
      from = as.Date("2018-04-02"),
      to = as.Date("2018-04-08"),
      by = "day"
    ), "%A"),
    "month" = month.abb,
    "year" = as.character(min_year:max_year)
  )
  # aggregate the data by group and count the number of records
  plot_data <- data.table(date = format(x, date_pattern[group]))[, list(count = .N), by = date]
  # merge x and complete sequence of hours to indicate missing dates
  plot_data <- merge(
    x = plot_data,
    y = data.table(date = factor_levels[[group]]),
    by = "date",
    all = TRUE
  )
  # coerce to factor to properly order the graph
  plot_data$date <- factor(plot_data$date, levels = factor_levels[[group]], ordered = TRUE)
  # plot
  date <- count <- NULL # for CMD check
  plot <- ggplot(data = plot_data[!is.na(count)], mapping = aes(x = date, y = count)) +
    geom_point() +
    geom_segment(mapping = aes(xend = date, yend = 0))
  if (nrow(plot_data[is.na(count)]) > 0) {
    plot <- plot +
      geom_point(
        data = plot_data[is.na(count)], mapping = aes(x = date, y = 0),
        color = "red"
      )
  }
  plot <- plot +
    geom_hline(yintercept = length(x) / length(levels(plot_data$date)), linetype = "dashed") +
    labs(x = stringi::stri_trans_totitle(group), y = "Number of records")
  if (group != "hour") {
    plot <- plot + coord_flip()
  }
  if (title) {
    plot <- plot + labs(
      title = "Total number of records by group. Group without data are plotted in red.",
      subtitle = "The dashed line correspond to the expected number of records by group."
    )
  }
  return(plot)
}

#'
#' This function plots the frequency of records by group of dates for every scopes of the
#' date vector.
#'
#' @param x a date vector
#'
#' @return a frequency plot by scope organized in a grid.
#'
#' @details The scope can be hourly, daily, monhly and/or yearly. If a date vector has
#'   more than two different days then it has a daily scope. The same goes for months,
#'   years and hours.
#'
#' @importFrom cowplot ggdraw draw_label plot_grid
#'
plot_freq_date <- function(x) {
  # check argument
  # if (!is_date(x)) {
  #   stop('"x" must be a date.')
  # }
  # indicators of hourly, daily, monthly and yearly data
  hourly <- length(unique(format(x, "%H"))) > 1
  daily <- length(unique(format(x, "%d"))) > 1
  monthly <- length(unique(format(x, "%B"))) > 1
  yearly <- length(unique(format(x, "%Y"))) > 1
  # plot number of records by group
  if (hourly) {
    plot_hourly <- plot_freq_date_group(x, group = "hour", title = FALSE)
  } else {
    plot_hourly <- NULL
  }
  if (daily) {
    plot_daily <- plot_freq_date_group(x, group = "weekday", title = FALSE)
  } else {
    plot_daily <- NULL
  }
  if (monthly) {
    plot_monthly <- plot_freq_date_group(x, group = "month", title = FALSE)
  } else {
    plot_monthly <- NULL
  }
  if (yearly) {
    plot_yearly <- plot_freq_date_group(x, group = "year", title = FALSE)
  } else {
    plot_yearly <- NULL
  }
  plot_list <- list(plot_hourly, plot_daily, plot_monthly, plot_yearly)
  plot_list <- plot_list[!sapply(plot_list, is.null)]
  title <- cowplot::ggdraw() +
    cowplot::draw_label("Total number of records by group.", vjust = 0)
  plots <- cowplot::plot_grid(plotlist = plot_list, ncol = 2)
  annotation <- cowplot::ggdraw() +
    cowplot::draw_label("Group without data are plotted in red. The dashed line correspond to the expected number of records.",
      size = 10
    )
  freq_plot <- cowplot::plot_grid(title, plots, annotation, rel_heights = c(0.1, 1, 0.1), ncol = 1)
  return(freq_plot)
}


#'
#' This function plots the number of records by date and displays the dates without data
#' between the min and max of x.
#'
#' @param x a date
#'
#' @return a plot of the missing dates. Depending on the scope of the date vector it can
#'   be a line chart or a calendar chart.
#'
#' @import data.table
#' @import ggplot2
#' @importFrom lubridate date
#'
plot_missing_date <- function(x) {
  # check argument
  # if (!is_date(x)) {
  #   stop('"x" must be a date.')
  # }
  # indicators of hourly, daily, monthly and yearly data
  daily <- length(unique(format(x, "%d"))) > 1
  monthly <- length(unique(format(x, "%B"))) > 1
  yearly <- length(unique(format(x, "%Y"))) > 1

  # remove missing
  x <- x[!is.na(x)]

  # min
  min_date <- min(x)
  # max
  max_date <- max(x)

  # aggregate the data by day and count the number of records
  date <- NULL # for CMD check
  plot_data <- data.table(date = lubridate::date(x))[, list(count = .N), by = date]

  # the limits of the sequence of dates (month or year)
  if (yearly & !monthly & !daily) {
    from <- as.Date(format(min(x), "%Y-01-01"))
    to <- (seq(as.Date(format(max(x), "%Y-01-01")),
      length.out = 2,
      by = "1 year"
    ) - 1)[2]
    by <- "year"
  } else {
    from <- as.Date(format(min(x), "%Y-%m-01"))
    to <- (seq(as.Date(format(max(x), "%Y-%m-01")),
      length.out = 2,
      by = "1 month"
    ) - 1)[2]
    if (daily) {
      by <- "day"
    } else {
      by <- "month"
    }
  }

  # sequence of dates from min to max
  date_seq <- seq(
    from = from,
    to = to,
    by = by
  )

  # merge x and date_seq to indicate missing dates
  plot_data <- merge(
    x = plot_data,
    y = data.table(date = date_seq),
    by = "date",
    all = TRUE
  )

  # plot
  if (yearly) {
    count <- count_shift <- group <- NULL # for CMD check
    plot_data[, count_shift := shift(count, 1)]
    plot_data[, group := ifelse(xor(is.na(count), is.na(count_shift)), 1, 0)]
    plot_data[, group := cumsum(group)]
    plot <- ggplot(data = plot_data, mapping = aes(x = date, y = count, group = group)) +
      geom_line()
    if (nrow(plot_data[is.na(count)]) > 0) {
      plot <- plot +
        geom_point(
          data = plot_data[is.na(count)][date > min_date & date < max_date],
          mapping = aes(x = date, y = 0),
          color = "red"
        )
    }
    plot <- plot +
      labs(
        x = "Date", y = "Number of records",
        title = "Number of records by date. Dates without data are plotted as red points."
      )
  } else if (monthly & !daily) {
    plot_data$month <- format(plot_data$date, "%B")
    plot_data$month <- factor(plot_data$month, levels = month.name, ordered = TRUE)

    # plot
    month <- NULL # for CMD check
    plot <- ggplot(data = plot_data[!is.na(count)], mapping = aes(x = month, y = count, group = group)) +
      geom_point() +
      geom_segment(mapping = aes(xend = month, yend = 0))
    if (nrow(plot_data[is.na(count)]) > 0) {
      plot <- plot +
        geom_point(
          data = plot_data[is.na(count)][date > min_date & date < max_date],
          mapping = aes(x = month, y = 0),
          color = "red"
        )
    }
    plot <- plot +
      coord_flip() +
      labs(
        x = "Number of recods", y = "Month",
        title = "Number of records by month. Months without data are plotted in red."
      ) +
      theme(
        strip.background = element_rect(fill = NA, color = NA),
        strip.text.x = element_text(hjust = 0, face = "bold")
      )
  } else if (daily) {
    plot_data$dow <- format(plot_data$date, "%a")
    levels_dow <- format(seq(
      from = as.Date("2018-01-01"),
      to = as.Date("2018-01-07"),
      by = "day"
    ), "%a")
    plot_data$dow <- factor(plot_data$dow,
      levels = levels_dow,
      ordered = TRUE
    )
    plot_data$dom <- format(plot_data$date, "%d")
    plot_data$woy <- as.numeric(format(plot_data$date, "%W"))
    plot_data$month <- format(plot_data$date, "%B")
    plot_data$month <- factor(plot_data$month,
      levels = month.name,
      ordered = TRUE
    )

    # plot
    dow <- woy <- dom <- NULL # for CMD check
    plot <- ggplot(
      data = plot_data,
      mapping = aes(x = dow, y = -woy, fill = count, label = dom)
    ) +
      geom_tile(color = "gray80") +
      geom_text() +
      scale_fill_gradient(name = "Number of 
records", na.value = "transparent") +
      scale_x_discrete(position = "top") +
      facet_wrap(~month, ncol = 4, scales = "free") +
      labs(title = paste0("Number of records by day. Days without records are colored in white.")) +
      theme(
        panel.background = element_rect(fill = NA, color = NA),
        strip.background = element_rect(fill = NA, color = NA),
        strip.text.x = element_text(hjust = 0, face = "bold"),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text.y = element_blank(),
        strip.placement = "outsite"
      )
  }
  return(plot)
}

#'
#' This functions desxribes a date vector. Summary statistics are given and frequency and
#' missing plots are generated.
#'
#' @param x a date vector (can be of class Date and POSIXct)
#' @param plot logical, whether to plot the graph. Defaults to TRUE.
#'
#' @return a table with summary statistics (length, number of na, , number of unique, min
#'   and max) and if plot is true the frequency plot from plot_freq_date and a plot of the
#'   missing values from plot_missing_date.
#'
#' @import ggplot2
#'
audit_vector_date <- function(x, plot = TRUE) {
  # check argument
  # if (!is_date(x)) {
  #   stop('"x" must be a date.')
  # }
  # summary statistics
  # length
  length <- length(x)
  # remove NAs
  x <- x[!is.na(x)]
  # number unique dates
  n_unique <- length(unique(x))
  # number of NAs
  n_na <- length - length(x)
  # min
  min_date <- min(x)
  # max
  max_date <- max(x)

  # summary output
  summary_output <- data.frame(
    Indicator = c(
      "Length", "Number of NAs", "Number of unique values", "Minimum date",
      "Maximum date"
    ),
    Value = c(length, n_unique, n_na, as.character(min_date), as.character(max_date))
  )

  # plot number of records by group
  freq_plot <- plot_freq_date(x)

  # visualize number of records and missing by date
  missing_plot <- plot_missing_date(x)

  if (plot) {
    print(freq_plot)
    print(missing_plot)
  }
  invisible(list(
    summary_stat = summary_output,
    freq_plot = freq_plot,
    missing_plot = missing_plot
  ))
}
MathieuMarauri/auditdata documentation built on March 6, 2020, 7:09 p.m.