## Load library
library(data.table)
library(rmarkdown)
library(knitr)

## Set knitr options
opts_chunk$set(
  fig.width = 14,
  fig.height = 10,
  echo = FALSE,
  results = "asis",
  warning = TRUE
)

## Get user data
data <- params$data
report_config <- params$report_config
response <- params$response

## Create helper functions
do_call <- function(fun_name, na_omit = FALSE) {
  if (na_omit) data <- na.omit(data)
  do.call(fun_name, c(list("data" = data), report_config[[fun_name]]))
}

## Create global data
intro <- do_call("introduce")
if ("introduce" %in% names(report_config)) {
  memory_usage <- intro[["memory_usage"]]
  class(memory_usage) <- "object_size"
  intro_df <- data.frame(
    "Name" = c("Rows", "Columns",
               "Discrete columns", "Continuous columns", "All missing columns",
               "Missing observations", "Complete Rows",
               "Total observations", "Memory allocation"),
    "Value" = c(
      format(intro[["rows"]], big.mark = ","),
      format(intro[["columns"]], big.mark = ","),
      format(intro[["discrete_columns"]], big.mark = ","),
      format(intro[["continuous_columns"]], big.mark = ","),
      format(intro[["all_missing_columns"]], big.mark = ","),
      format(intro[["total_missing_values"]], big.mark = ","),
      format(intro[["complete_rows"]], big.mark = ","),
      format(intro[["total_observations"]], big.mark = ","),
      format(memory_usage, unit = "auto")
    )
  )
  ## Render content
  cat("### Basic Statistics", fill = TRUE)
  cat("#### Raw Counts", fill = TRUE)
  kable(intro_df)
}
if ("plot_intro" %in% names(report_config)) {
  cat("#### Percentages", fill = TRUE)
  do_call("plot_intro")
}
if ("plot_str" %in% names(report_config)) {
  str_object <- do.call(plot_str, c(list("data" = data, "max_level" = report_config[["plot_str"]][["max_level"]], "print_network" = FALSE)))
  plot_option <- report_config[["plot_str"]][setdiff(names(report_config[["plot_str"]]), c("type", "max_level", "print_network"))]
  ## Render content
  cat("### Data Structure", fill = TRUE)
  if (report_config[["plot_str"]][["type"]] == "radial") {
    do.call(radialNetwork, c(list("List" = str_object), plot_option))
  } else {
    do.call(diagonalNetwork, c(list("List" = str_object), plot_option))
  }
}
if ("plot_missing" %in% names(report_config)) {
  cat("### Missing Data Profile", fill = TRUE)
  do_call("plot_missing")
}
if (any(c("plot_bar", "plot_histogram", "plot_density", "plot_qq") %in% names(report_config))) {
  cat("### Univariate Distribution", fill = TRUE)
}
if ("plot_histogram" %in% names(report_config)) {
  if (intro[["continuous_columns"]] > 0) {
    cat("#### Histogram", fill = TRUE)
    do_call("plot_histogram")
  }
}
if ("plot_density" %in% names(report_config)) {
  if (intro[["continuous_columns"]] > 0) {
    cat("#### Density Estimates", fill = TRUE)
    do_call("plot_density")
  }
}
if ("plot_bar" %in% names(report_config)) {
  if (intro[["discrete_columns"]] > 0) {
    bar_option <- report_config[["plot_bar"]][setdiff(names(report_config[["plot_bar"]]), "with")]
    cat("#### Bar Chart (by frequency)", fill = TRUE)
    do.call(plot_bar, c(list("data" = data), bar_option))
  }
}
if ("plot_bar" %in% names(report_config)) {
  if (intro[["discrete_columns"]] > 0) {
    if (!is.null(response)) {
      if (is.numeric(data[[response]])) {
        cat(paste0("#### Bar Chart (by ", response, ")"), fill = TRUE)
        do.call(plot_bar, c(list("data" = data, "with" = response), bar_option))
      }
    }
  }
}
if ("plot_bar" %in% names(report_config)) {
  if (intro[["discrete_columns"]] > 0) {
    with_value <- report_config[["plot_bar"]][["with"]]
    if (!is.null(with_value)) {
      if (response != with_value) {
        cat(paste0("#### Bar Chart (by ", with_value, ")"), fill = TRUE)
        do_call("plot_bar")
      }
    }
  }
}
if ("plot_qq" %in% names(report_config)) {
  if (intro[["continuous_columns"]] > 0) {
    qq_option <- report_config[["plot_qq"]][setdiff(names(report_config[["plot_qq"]]), "by")]
    cat("#### QQ Plot", fill = TRUE)
    do.call(plot_qq, c(list("data" = data), qq_option))
  }
}
if ("plot_qq" %in% names(report_config)) {
  if (intro[["continuous_columns"]] > 0) {
    if (!is.null(response)) {
      cat(paste0("#### QQ Plot (by ", response, ")"), fill = TRUE)
      do.call(plot_qq, c(list("data" = data, "by" = response), qq_option))
    }
  }
}
if ("plot_qq" %in% names(report_config)) {
  if (intro[["continuous_columns"]] > 0) {
    by_value <- report_config[["plot_qq"]][["by"]]
    if (!is.null(by_value)) {
      if (response != by_value) {
        cat(paste0("#### QQ Plot (by ", by_value, ")"), fill = TRUE)
        do_call("plot_qq")
      }
    }
  }
}
if ("plot_correlation" %in% names(report_config)) {
  cat("### Correlation Analysis", fill = TRUE)
  if (intro[["complete_rows"]] > 0) {
    do_call("plot_correlation", na_omit = TRUE)
  } else {
    message("Insufficient complete rows! Skipping correlation analysis.")
  }
}
if ("plot_prcomp" %in% names(report_config)) {
  cat("### Principal Component Analysis", fill = TRUE)
  if (intro[["complete_rows"]] > 0) {
    if (!is.null(response)) {
      dt <- na.omit(data[, -response, with = FALSE])
      do.call(plot_prcomp, c(list("data" = dt), report_config[["plot_prcomp"]]))
    } else {
      do_call("plot_prcomp", na_omit = TRUE)
    }
  } else {
    message("Insufficient complete rows! Skipping principal component analysis.")
  }
}
if (any(c("plot_boxplot", "plot_scatterplot") %in% names(report_config)) & !is.null(response)) {
  cat("### Bivariate Distribution", fill = TRUE)
}
if ("plot_boxplot" %in% names(report_config)) {
  boxplot_option <- report_config[["plot_boxplot"]][setdiff(names(report_config[["plot_boxplot"]]), "by")]
  if (!is.null(response)) {
    cat(paste0("#### Boxplot (by ", response, ")"), fill = TRUE)
    do.call(plot_boxplot, c(list("data" = data, "by" = response), boxplot_option))
  }
}
if ("plot_boxplot" %in% names(report_config)) {
  by_value <- report_config[["plot_boxplot"]][["by"]]
  if (!is.null(by_value)) {
    if (response != by_value) {
      cat(paste0("#### Boxplot (by ", by_value, ")"), fill = TRUE)
      do_call("plot_boxplot")
    }
  }
}
if ("plot_scatterplot" %in% names(report_config)) {
  scatterplot_option <- report_config[["plot_scatterplot"]][setdiff(names(report_config[["plot_scatterplot"]]), "by")]
  if (!is.null(response)) {
    cat(paste0("#### Scatterplot (by ", response, ")"), fill = TRUE)
    do.call(plot_scatterplot, c(list("data" = data, "by" = response), scatterplot_option))
  }
}
if ("plot_scatterplot" %in% names(report_config)) {
  by_value <- report_config[["plot_scatterplot"]][["by"]]
  if (!is.null(by_value)) {
    if (response != by_value) {
      cat(paste0("#### Scatterplot (by ", by_value, ")"), fill = TRUE)
      do_call("plot_scatterplot")
    }
  }
}


boxuancui/eda documentation built on Jan. 11, 2020, 11:14 a.m.