In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets

# Report-wide chunk defaults, set in a single call: do not echo source code,
# print output without a comment prefix, and turn off row-name printing.
knitr::opts_chunk$set(
  echo = FALSE,
  comment = NA,
  rownames.print = FALSE
)

```{css daiquiri-styles, echo=FALSE}
h1.title {
  font-size: 28px;
}
p.compact {
  margin-bottom: 0px;
  margin-top: 0px;
}
```

```r
# The report is stratified whenever the source data names a strata field;
# the strata labels are kept alongside the flag.
strata_labels <- params$source_data$strata_labels
stratify <- !is.null(params$source_data$strata_field_name)

Dataset: **`r encodeString(params$source_data$dataset_description)`**

Aggregated by: **`r params$aggregated_data$aggregation_timeunit`** `r if (stratify) {paste0("; Stratified by: ", params$source_data$strata_field_name, "")}`

Report created on: `r Sys.time()` ; daiquiri version `r utils::packageVersion(utils::packageName())` ; `r R.Version()$version.string`

# {.tabset .tabset-pills}

## Source data {.tabset}

# Summarise the source data once (progress output disabled) and keep a copy
# of the per-field table with each field's row position appended.
sourcesummary <- summarise_source_data(
  params$source_data,
  show_progress = FALSE
)
source_data_fields <- sourcesummary$data_fields
# seq_len() gives an empty sequence for a zero-row table, whereas
# seq.int(0) would produce c(1, 0) and make the column assignment fail.
source_data_fields$ordinal_position <- seq_len(nrow(source_data_fields))

# Stratification flag and labels derived from the source data.
stratify <- !is.null(params$source_data$strata_field_name)
strata_labels <- params$source_data$strata_labels

### Data fields imported

# Table of the data fields whose type is anything other than "ignore".
# NOTE: trimws() needed as summarise_source_data() is optimised for console printout
imported_rows <- which(
  trimws(sourcesummary$data_fields$field_type) != "ignore"
)

# Display names per column; the datatype column is hidden
imported_columns <- list(
  field_name = reactable::colDef(
    name = "Field name",
    style = list(fontWeight = "bold")
  ),
  field_type = reactable::colDef(name = "Field type"),
  datatype = reactable::colDef(show = FALSE),
  count = reactable::colDef(name = "Total values"),
  missing = reactable::colDef(name = "Missing values"),
  min = reactable::colDef(name = "Min value"),
  max = reactable::colDef(name = "Max value"),
  validation_warnings = reactable::colDef(name = "Validation warnings"),
  ordinal_position = reactable::colDef(name = "Column position")
)

reactable::reactable(
  source_data_fields[imported_rows, ],
  columns = imported_columns,
  sortable = TRUE,
  filterable = FALSE,
  searchable = FALSE,
  rownames = FALSE,
  pagination = FALSE,
  striped = TRUE,
  highlight = TRUE
)

### Data fields ignored

# Compact table of the fields whose type is "ignore", limited to the name,
# type and position columns.
# NOTE: trimws() needed as summarise_source_data() is optimised for console printout
ignored_rows <- which(
  trimws(sourcesummary$data_fields$field_type) == "ignore"
)
ignored_fields <- source_data_fields[
  ignored_rows,
  c("field_name", "field_type", "ordinal_position")
]

ignored_columns <- list(
  field_name = reactable::colDef(name = "Field name", minWidth = 200),
  field_type = reactable::colDef(name = "Field type", minWidth = 100),
  ordinal_position = reactable::colDef(name = "Column position")
)

reactable::reactable(
  ignored_fields,
  columns = ignored_columns,
  sortable = TRUE,
  filterable = FALSE,
  searchable = FALSE,
  pagination = FALSE,
  rownames = FALSE,
  compact = TRUE,
  fullWidth = FALSE
)

### Validation warnings

# Table of the validation warnings held in the source data summary.
warning_columns <- list(
  field_name = reactable::colDef(
    name = "Field name",
    style = list(fontWeight = "bold"),
    minWidth = 200
  ),
  message = reactable::colDef(name = "Details", minWidth = 500),
  instances = reactable::colDef(name = "Instances")
)

reactable::reactable(
  sourcesummary$validation_warnings,
  columns = warning_columns,
  sortable = TRUE,
  filterable = FALSE,
  searchable = FALSE,
  rownames = FALSE,
  pagination = FALSE,
  striped = TRUE,
  highlight = TRUE,
  compact = TRUE,
  fullWidth = FALSE
)

### Summary

# Two-column overview of the source data: a label in "Item" and the matching
# entry of sourcesummary$overall (or a figure derived from the validation
# warnings) in "Value".
source_overall <- sourcesummary$overall
warning_instances <- sourcesummary$validation_warnings$instances

# Leading rows: display label keyed by the entry name in the overall summary
leading_rows <- c(
  cols_source_n = "Columns in source",
  cols_imported_n = "Columns imported",
  timepoint_field_name = "Column used for timepoint",
  timepoint_min = "Min timepoint value",
  timepoint_max = "Max timepoint value",
  rows_source_n = "Rows in source",
  rows_duplicates_n = "Duplicate rows",
  timepoint_missing_n = "Rows missing timepoint values",
  rows_imported_n = "Rows imported"
)

# Strata rows are only present for stratified reports; NULL vanishes in c()
strata_items <- NULL
strata_values <- NULL
if (stratify) {
  strata_items <- c("Column used for strata", "Strata values")
  strata_values <- c(
    source_overall["strata_field_name"],
    # separators become HTML line breaks for the rendered cell
    gsub(", ", "<br>", source_overall["strata_labels"], fixed = TRUE)
  )
}

# A warning whose instance count is NA adds one to the total
total_warnings <- sum(warning_instances, na.rm = TRUE) +
  sum(is.na(warning_instances))

summarydf <- data.frame(
  "Item" = c(
    unname(leading_rows),
    strata_items,
    "Strings interpreted as missing values",
    "Total validation warnings"
  ),
  "Value" = c(
    source_overall[names(leading_rows)],
    strata_values,
    gsub("\n", "<br>", source_overall["na_values"], fixed = TRUE),
    total_warnings
  ),
  stringsAsFactors = FALSE
)

# Render the source summary as a static, header-less two-column table.
# The Value column is rendered as HTML so the "<br>" separators inserted
# when building summarydf show up as line breaks.
summary_columns <- list(
  Item = reactable::colDef(
    name = "",
    style = list(fontWeight = "bold"),
    minWidth = 300
  ),
  Value = reactable::colDef(name = "", minWidth = 200, html = TRUE)
)

reactable::reactable(
  summarydf,
  columns = summary_columns,
  sortable = FALSE,
  filterable = FALSE,
  searchable = FALSE,
  pagination = FALSE,
  rownames = FALSE,
  striped = TRUE,
  compact = TRUE,
  fullWidth = FALSE
)

## Aggregated data {.tabset}

# Shortcuts to the aggregated fields (overall and stratified) plus the
# summary of the aggregated data used by the sections that follow.
agg_fields <- params$aggregated_data$aggregated_fields
agg_fields_strat <- params$aggregated_data$aggregated_fields_stratified
aggsummary <- summarise_aggregated_data(params$aggregated_data)

# For stratified reports, emit a "Strata" tabset with one heatmap tab per
# stratum-level aggregation function (tab title -> aggregation function).
if (stratify) {
  strata_tabs <- c(Number = "stratum_n", Percentage = "stratum_perc")

  cat("\n### Strata {.tabset}\n")

  for (tab_title in names(strata_tabs)) {
    cat("\n#### ", tab_title, "\n", sep = "")
    print(
      plot_subcat_heatmap_static(
        agg_field = agg_fields[[params$source_data$strata_field_name]],
        # [[ ]] drops the name so a bare string is passed on
        aggregation_function = strata_tabs[[tab_title]]
      )
    )
    cat("\n")
  }
}

### Values present {.tabset}

# Overview of record counts ("n") per aggregation time unit: one plot for the
# whole dataset, followed by one tab per stratum when stratified.
records_title <- paste(
  "Records per",
  params$aggregated_data$aggregation_timeunit
)
timepoint_field <- params$aggregated_data$timepoint_field_name

p_all <- plot_overview_combo_static(
  agg_fields = agg_fields,
  aggregation_function = "n",
  lineplot_field_name = timepoint_field,
  title = records_title
)

# In a stratified report the whole-dataset plot sits in a leading "All" tab
if (stratify) {
  cat("\n####", "All\n")
}
print(p_all)
cat("\n")

if (stratify) {
  # remaining tabs: the same overview restricted to a single stratum
  for (i in seq_along(strata_labels)) {
    stratum_label <- strata_labels[i]
    cat("\n####", stratum_label, "\n")
    print(
      plot_overview_combo_static(
        agg_fields = agg_fields_strat,
        aggregation_function = "n",
        lineplot_field_name = timepoint_field,
        title = records_title,
        stratum = stratum_label
      )
    )
    cat("\n")
  }
}

### Missing values {.tabset}

# Overview of missing values ("missing_n") per aggregation time unit, with
# the line plot drawn for "[ALL_FIELDS_COMBINED]": one plot for the whole
# dataset, followed by one tab per stratum when stratified.
missing_title <- paste(
  "Total missing values per",
  params$aggregated_data$aggregation_timeunit
)

p_all <- plot_overview_combo_static(
  agg_fields = agg_fields,
  aggregation_function = "missing_n",
  lineplot_field_name = "[ALL_FIELDS_COMBINED]",
  title = missing_title
)

# In a stratified report the whole-dataset plot sits in a leading "All" tab
if (stratify) {
  cat("\n####", "All\n")
}
print(p_all)
cat("\n")

if (stratify) {
  # remaining tabs: the same overview restricted to a single stratum
  for (i in seq_along(strata_labels)) {
    stratum_label <- strata_labels[i]
    cat("\n####", stratum_label, "\n")
    print(
      plot_overview_combo_static(
        agg_fields = agg_fields_strat,
        aggregation_function = "missing_n",
        lineplot_field_name = "[ALL_FIELDS_COMBINED]",
        title = missing_title,
        stratum = stratum_label
      )
    )
    cat("\n")
  }
}

### Non-conformant values {.tabset}

# Overview of non-conformant values ("nonconformant_n") per aggregation time
# unit, with the line plot drawn for "[ALL_FIELDS_COMBINED]": one plot for
# the whole dataset, followed by one tab per stratum when stratified.
nonconformant_title <- paste(
  "Total nonconformant values per",
  params$aggregated_data$aggregation_timeunit
)

p_all <- plot_overview_combo_static(
  agg_fields = agg_fields,
  aggregation_function = "nonconformant_n",
  lineplot_field_name = "[ALL_FIELDS_COMBINED]",
  title = nonconformant_title
)

# In a stratified report the whole-dataset plot sits in a leading "All" tab
if (stratify) {
  cat("\n####", "All\n")
}
print(p_all)
cat("\n")

if (stratify) {
  # remaining tabs: the same overview restricted to a single stratum
  for (i in seq_along(strata_labels)) {
    stratum_label <- strata_labels[i]
    cat("\n####", stratum_label, "\n")
    print(
      plot_overview_combo_static(
        agg_fields = agg_fields_strat,
        aggregation_function = "nonconformant_n",
        lineplot_field_name = "[ALL_FIELDS_COMBINED]",
        title = nonconformant_title,
        stratum = stratum_label
      )
    )
    cat("\n")
  }
}

### Duplicate records

# Totals plot for the "[DUPLICATES]" aggregated field, using the "sum"
# aggregation function.
duplicates_title <- paste(
  "Total duplicate records per",
  params$aggregated_data$aggregation_timeunit
)

plot_overview_totals_static(
  agg_field = agg_fields[["[DUPLICATES]"]],
  aggregation_function = "sum",
  title = duplicates_title
)

### Summary

# Two-column overview of the aggregated data. Each display label is keyed by
# the name of the entry it reports from aggsummary$overall, so labels and
# values stay aligned.
agg_summary_rows <- c(
  timepoint_field_name = "Column used for timepoint",
  timepoint_min = "Min timepoint value",
  timepoint_max = "Max timepoint value",
  aggregation_timeunit = "Timepoint aggregation unit",
  n_timepoints = "Total number of timepoints",
  n_empty_timepoints = "Number of empty timepoints",
  n_fields = "Number of data fields"
)

aggsummarydf <- data.frame(
  "Item" = unname(agg_summary_rows),
  "Value" = aggsummary$overall[names(agg_summary_rows)],
  stringsAsFactors = FALSE
)

# Render the aggregated-data summary as a static, header-less two-column
# table.
agg_summary_columns <- list(
  Item = reactable::colDef(
    name = "",
    style = list(fontWeight = "bold"),
    minWidth = 300
  ),
  Value = reactable::colDef(name = "", minWidth = 200)
)

reactable::reactable(
  aggsummarydf,
  columns = agg_summary_columns,
  sortable = FALSE,
  filterable = FALSE,
  searchable = FALSE,
  pagination = FALSE,
  rownames = FALSE,
  striped = TRUE,
  compact = TRUE,
  fullWidth = FALSE
)

## Individual data fields {.tabset}

# NOTE: can't modify chunk options within a chunk to get different fig heights
# (https://github.com/yihui/knitr/issues/841) so have to make plots consistent for all tabs
# Stratified reports get a height that grows with the number of strata;
# otherwise the current default fig.height chunk option is used.
individual_fields_fig_height <-
  if (stratify) {
    1 + length(strata_labels) * 0.8
  } else {
    knitr::opts_chunk$get("fig.height")
  }

# Emit one tabset per aggregated field, containing one tab per aggregation
# function. Stratified reports draw a combined stratified plot in each tab and
# skip the strata field itself. Unstratified reports draw a time series, with
# "subcat" functions expanded into a nested tabset (heatmap plus one time
# series per subcategory column).
for (i in seq_along(names(agg_fields))) {
  agg_field <- agg_fields[[i]]
  field_name <- names(agg_fields)[i]

  # the strata field gets no tab of its own in a stratified report
  if (stratify && field_name == params$source_data$strata_field_name) {
    next
  }

  cat("\n###", field_name, " {.tabset}\n")

  if (stratify) {
    # stratified counterpart of this field, looked up by name
    agg_field_strat <- agg_fields_strat[[field_name]]
    for (aggregation_function in agg_field$function_list) {
      cat("\n####", agg_fun_friendly_name(aggregation_function, "short"), "\n")
      print(
        plot_stratified_combo_static(
          agg_field = agg_field,
          agg_field_strat = agg_field_strat,
          aggregation_function = aggregation_function
        )
      )
      cat("\n")
    }
    next
  }

  for (aggregation_function in agg_field$function_list) {
    if (!startsWith(aggregation_function, "subcat")) {
      # plain aggregation function: a single time series tab
      cat("\n####", agg_fun_friendly_name(aggregation_function, "short"), "\n")
      print(
        plot_timeseries_static(
          agg_field = agg_field,
          agg_fun_colname = aggregation_function
        )
      )
      cat("\n")
      next
    }

    cat("\n####", agg_fun_friendly_name(aggregation_function, "short"), " {.tabset}\n")

    # first tab has all subcats in heatmap
    cat("\n#####", "All\n")
    print(
      plot_subcat_heatmap_static(
        agg_field = agg_field,
        aggregation_function = aggregation_function
      )
    )
    cat("\n")

    # following tabs show individual time series, one per value column whose
    # name starts with this aggregation function
    value_cols <- names(agg_field$values)
    subcat_cols <- value_cols[which(startsWith(value_cols, aggregation_function))]
    for (subcat_col in subcat_cols) {
      cat("\n#####", agg_fun_subcat_value(subcat_col), "\n")
      print(
        plot_timeseries_static(
          agg_field = agg_field,
          agg_fun_colname = subcat_col
        )
      )
      cat("\n")
    }
  }
}

phuongquan/daiquiri documentation built on April 5, 2024, 10:36 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

phuongquan/daiquiri
Data Quality Reporting for Temporal Datasets

In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets

{.tabset .tabset-pills}

Source data {.tabset}

Data fields imported

Data fields ignored

Validation warnings

Summary

Aggregated data {.tabset}

Values present {.tabset}

Missing values {.tabset}

Non-conformant values {.tabset}

Duplicate records

Summary

Individual data fields {.tabset}

R Package Documentation

Browse R Packages

We want your feedback!

phuongquan/daiquiri Data Quality Reporting for Temporal Datasets

In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets

{.tabset .tabset-pills}

Source data {.tabset}

Data fields imported

Data fields ignored

Validation warnings

Summary

Aggregated data {.tabset}

Values present {.tabset}

Missing values {.tabset}

Non-conformant values {.tabset}

Duplicate records

Summary

Individual data fields {.tabset}

R Package Documentation

Browse R Packages

We want your feedback!

phuongquan/daiquiri
Data Quality Reporting for Temporal Datasets