DQAstats2SHIPdataset.R
In DQAstats: Core Functions for Data Quality Assessment

## ----include = FALSE----------------------------------------------------------
# nolint start
library(magrittr)


## -----------------------------------------------------------------------------
# the next lines are commented out, since we wanted to avoid adding 'dataquieR'
# as yet another dependency of our package; however, if you have installed it,
# you can uncomment these lines to load the required files from the
# dataquieR-package

#list.files(system.file("extdata", package = "dataquieR"))

# load the ship dataset
# ship_data <- readRDS(
#   system.file("extdata/ship.RDS", package = "dataquieR")
# ) %>%
#   data.table::data.table()
# Download the ship dataset (an RDS file) into a temporary file and load it
# as a data.table.
ship_url <- "https://dataquality.qihs.uni-greifswald.de/extdata/ship.RDS"
tmpf <- tempfile()
download.file(url = ship_url, destfile = tmpf, mode = "wb")
ship_data <- data.table::data.table(readRDS(tmpf))

# Export the dataset as a CSV-file (one of the formats supported by DQAstats)
# into the temporary directory of this R session.
ship_data_export_fn <- "ship_data.csv"
ship_data_export_path <- file.path(tempdir(), ship_data_export_fn)
data.table::fwrite(x = ship_data, file = ship_data_export_path)


## -----------------------------------------------------------------------------
# load the ship metadata
# ship_meta <- readRDS(
#   system.file("extdata/ship_meta.RDS", package = "dataquieR")
# ) %>%
#   data.table::data.table()
# Download the ship metadata (an RDS file) into a fresh temporary file and
# load it as a data.table.
ship_meta_url <- paste0(
  "https://dataquality.qihs.uni-greifswald.de",
  "/extdata/ship_meta.RDS"
)
tmpf <- tempfile()
download.file(url = ship_meta_url, destfile = tmpf, mode = "wb")
ship_meta <- data.table::data.table(readRDS(tmpf))


## ----results='asis'-----------------------------------------------------------
# Show the ship metadata as an interactive table (horizontal scrolling
# enabled, four rows per page).
ship_meta_table_options <- list(scrollX = TRUE, pageLength = 4)
DT::datatable(ship_meta, options = ship_meta_table_options)


## -----------------------------------------------------------------------------
# Read only the header (zero data rows) of the example MDR that ships with
# DQAstats. The result is an empty MDR with all expected columns, each of
# type character.
mdr_template_file <- system.file(
  "demo_data/utilities/MDR/mdr_example_data.csv",
  package = "DQAstats"
)
mdr <- data.table::fread(
  file = mdr_template_file,
  colClasses = "character",
  header = TRUE,
  nrows = 0
)
dim(mdr)
#> [1]  0 15
colnames(mdr)
#>  [1] "designation"           "source_variable_name"  "filter"                "source_table_name"     "source_system_name"    "source_system_type"   
#>  [7] "key"                   "variable_name"         "variable_type"         "restricting_date_var"  "constraints"           "dqa_assessment"       
#> [13] "definition"            "data_map"              "plausibility_relation"



## ----warning=FALSE------------------------------------------------------------
# get names of dataelements that are present in both the ship dataset and
# the ship metadata
ship_var_names <- intersect(colnames(ship_data), ship_meta[, get("VAR_NAMES")])

# Build one single-row MDR entry per dataelement. The rows are collected in a
# list and bound to the MDR in one step below; growing `mdr` with
# `rbindlist()` inside a loop would copy the whole table on every iteration.
mdr_rows <- lapply(
  X = ship_var_names,
  FUN = function(var) {
    # get variable type from ship metadata
    data_type <- ship_meta[get("VAR_NAMES") == var, get("DATA_TYPE")]
    # get value labels from ship metadata; they are used as description
    labels <- ship_meta[get("VAR_NAMES") == var, get("VALUE_LABELS")]
    split_list <- strsplit(
      x = labels,
      split = " | ",
      fixed = TRUE
    )

    # convert value labels into a human readable description; dataelements
    # without value labels (NA) only get a generic description
    if (length(split_list[[1]]) == 1 && is.na(split_list[[1]])) {
      descr <- paste0("Description for dataelement '", var, "'")
    } else {
      descr <- paste0(
        "Description for dataelement '", var,
        "': \n\n", paste0(unlist(split_list), collapse = "; ")
      )
    }

    # row of the DQAstats-MDR for this dataelement; `cbind()` yields a
    # character matrix, so all columns (including "dqa_assessment") are
    # of type character
    data.table::as.data.table(
      x = cbind(
        "designation" = var,
        "definition" = descr,
        "source_variable_name" = var,
        "variable_name" = var,
        "key" = var,
        "source_table_name" = ship_data_export_fn,
        "source_system_type" = "csv", # indicates that datasets format is CSV
        "source_system_name" = "ship", # arbitrary name for this dataset
        "dqa_assessment" = 1, # default value for dataelements to be analyzed
        "variable_type" = data_type
      )
    )
  }
)

# append all rows to the MDR at once; `fill = TRUE` adds the MDR columns that
# are not set above as NA
mdr <- data.table::rbindlist(
  l = c(list(mdr), mdr_rows),
  fill = TRUE
)


## -----------------------------------------------------------------------------
# names of all categorical variables, collected in one vector
cat_vars <- c(
  "id",
  "sex",
  "obs_bp", "dev_bp",
  "obs_soma", "dev_length", "dev_weight",
  "obs_int",
  "school", "family", "smoking",
  "stroke", "myocard", "diab_known",
  "contraception", "income"
)

# these variables are typed as integer so far; mark them as "enumerated"
# instead (to get meaningful results from DQAstats)
mdr[
  get("designation") %in% cat_vars,
  ("variable_type") := "enumerated"
]


## ----warning=FALSE------------------------------------------------------------
# loop over categorical variables
for (var in cat_vars) {
  # get definitions of category-levels, e.g.
  # "0 = nonsmoker | 1 = former smoker | 2 = smoker"
  labels <- ship_meta[get("VAR_NAMES") == var, get("VALUE_LABELS")]
  split_list <- strsplit(
    x = labels,
    split = " | ",
    fixed = TRUE
  )

  # separate each level definition into its value and its label
  allowed_values <- strsplit(
    x = split_list[[1]],
    split = " =",
    fixed = TRUE
  )

  # keep only the value part of each level; `vapply()` guarantees a character
  # vector, whereas `sapply()` would return a list for empty input
  value_set <- vapply(
    X = allowed_values,
    FUN = function(x) x[[1]],
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )

  # transform allowed values into a "value_set" array, which is stored
  # as a JSON object (required by DQAstats); the result is deliberately not
  # named `c`, to avoid masking `base::c()`
  constraints_json <- jsonlite::toJSON(
    list("value_set" = value_set),
    pretty = TRUE
  )

  # add constraints for this dataelement to the DQAstats MDR
  mdr[
    get("source_variable_name") == var,
    ("constraints") := constraints_json
  ]
}


## -----------------------------------------------------------------------------
# For comparison, show how the allowed values of "smoking" are defined in
# both metadata formats.
# dataquieR: a single string, levels separated by pipes '|'
ship_meta[get("VAR_NAMES") == "smoking"][["VALUE_LABELS"]]
#> [1] "0 = nonsmoker | 1 = former smoker | 2 = smoker"


## -----------------------------------------------------------------------------
# ... and DQAstats: a JSON string, shown raw first and then parsed
smoking_constraints <- mdr[
  get("source_variable_name") == "smoking"
][["constraints"]]
smoking_constraints
#> [1] "{\n  \"value_set\": [\"0\", \"1\", \"2\"]\n}"
jsonlite::fromJSON(smoking_constraints)
#> $value_set
#> [1] "0" "1" "2"


## -----------------------------------------------------------------------------
# get names of continuous variables (float/integer)
cont_vars <- mdr[
  get("variable_type") %in% c("integer", "float"),
  unique(get("source_variable_name"))
]

# loop over continuous dataelements
for (var in cont_vars) {
  # extract the hard limits of the continuous dataelement from the metadata
  hard_limits <- ship_meta[get("VAR_NAMES") == var, get("HARD_LIMITS")]

  # dataelements without (defined) hard limits get no range constraint
  if (length(hard_limits) == 0 || is.na(hard_limits[[1]])) {
    next
  }

  # the lower and upper limit are separated by ";"
  # NOTE(review): presumably interval notation such as "[0;100)" -- confirm
  # against the dataquieR metadata conventions
  bounds <- strsplit(
    x = hard_limits[[1]],
    split = ";",
    fixed = TRUE
  )[[1]]

  # a limit definition without both parts cannot be converted into a range
  if (length(bounds) < 2) {
    next
  }

  # remove only the interval brackets; stripping all punctuation would also
  # drop minus signs and decimal points and thereby corrupt negative or
  # fractional limits (e.g. "0.5" would become 5)
  bounds <- trimws(gsub("[][()]", "", bounds))

  # transform the limits to a list with entries "min" and "max"
  range_constraint <- list(
    "min" = as.numeric(bounds[[1]]),
    "max" = as.numeric(bounds[[2]])
  )

  # save list as nested JSON within the key "range" (required by DQAstats);
  # the result is deliberately not named `c`, to avoid masking `base::c()`
  range_json <- jsonlite::toJSON(
    list("range" = range_constraint),
    pretty = TRUE,
    auto_unbox = TRUE
  )

  # write constraint for this dataelement to DQAstats MDR
  mdr[get("source_variable_name") == var, ("constraints") := range_json]
}


## -----------------------------------------------------------------------------
# The `id` variable should not really be handled as a categorical variable:
# the number of potential categories is (theoretically) unlimited, so an
# analysis of its categories is seldom meaningful.
# Quality checks on `id` can still be useful, e.g. to verify that the values
# follow a required format.
# With DQAstats, this is done by setting the variable type to "string" and
# providing a regular expression that describes the expected format.

# regular expression for the dataelement "id", wrapped in a JSON object
id_constraints <- jsonlite::toJSON(
  list("regex" = "^[[:digit:]]{1,5}$"),
  pretty = TRUE,
  auto_unbox = TRUE
)

# change variable type to "string"
mdr[
  get("source_variable_name") == "id",
  ("variable_type") := "string"
]

# replace the constraints of "id" with the regular expression
mdr[
  get("source_variable_name") == "id",
  ("constraints") := id_constraints
]


## -----------------------------------------------------------------------------
# add plausibilities
# sex and contraception: definition of the atemporal plausibility check
contraception_check <- list(
  "name" = "A_present_and_B_vv",
  "description" = "Contracept in males.",
  "join_crit" = "id",
  "filter" = list("ship" = "^1$"),
  "constraints" = list(
    "value_set" = list("ship" = "1")
  )
)

# serialize the check as nested JSON below the key "atemporal"
p <- jsonlite::toJSON(
  list(
    "atemporal" = list("contraception" = contraception_check)
  ),
  pretty = TRUE,
  auto_unbox = TRUE
)

# write plausibility to the MDR row of "sex" that is flagged for analysis
mdr[
  get("source_variable_name") == "sex" & get("dqa_assessment") == 1,
  ("plausibility_relation") := p
]


## -----------------------------------------------------------------------------
# diab_known and diab_age: definition of the atemporal plausibility check
diab_age_check <- list(
  "name" = "A_present_and_B_levels_vl",
  "description" = "Diab age but no diab.",
  "join_crit" = "id",
  "filter" = list("ship" = "^(?!.*(99900|99901|99801)).*$"),
  "constraints" = list(
    "value_set" = list("ship" = "1")
  )
)

# serialize the check as nested JSON below the key "atemporal"
p <- jsonlite::toJSON(
  list(
    "atemporal" = list("diab_age" = diab_age_check)
  ),
  pretty = TRUE,
  auto_unbox = TRUE
)

# write plausibility to the MDR row of "diab_known" that is flagged for
# analysis
mdr[
  get("source_variable_name") == "diab_known" & get("dqa_assessment") == 1,
  ("plausibility_relation") := p
]


## -----------------------------------------------------------------------------
# id and sex: definition of the uniqueness plausibility check
sex_uniqueness_check <- list(
  "name" = "Every ID is associated with one Sex",
  "description" = paste0(
    "With each distinct value of 'id', ",
    "only one value of 'sex' may be associated."
  )
)

# serialize the check as nested JSON below the key "uniqueness"
p <- jsonlite::toJSON(
  list(
    "uniqueness" = list("sex" = sex_uniqueness_check)
  ),
  pretty = TRUE,
  auto_unbox = TRUE
)

# write plausibility to the MDR row of "id" that is flagged for analysis
mdr[
  get("source_variable_name") == "id" & get("dqa_assessment") == 1,
  ("plausibility_relation") := p
]


## ----results='asis'-----------------------------------------------------------
# Show the assembled MDR as an interactive table (horizontal scrolling
# enabled, four rows per page).
mdr_table_options <- list(scrollX = TRUE, pageLength = 4)
DT::datatable(mdr, options = mdr_table_options)


## ----message=FALSE------------------------------------------------------------
# Create the folder "utilities/MDR" inside the temporary directory of this
# R session and store the MDR there as "mdr.csv".
utils_path <- file.path(tempdir(), "utilities")
mdr_dir <- file.path(utils_path, "MDR")
dir.create(mdr_dir, recursive = TRUE)

data.table::fwrite(x = mdr, file = file.path(mdr_dir, "mdr.csv"))


## ----results='hide'-----------------------------------------------------------
# Copy the "RMD" folder that ships with DQAstats (including its contents)
# into the utilities folder.
rmd_template_dir <- system.file(
  "demo_data/utilities/RMD",
  package = "DQAstats"
)
file.copy(
  from = rmd_template_dir,
  to = utils_path,
  recursive = TRUE,
  overwrite = TRUE
)


## ----eval=FALSE, warning=FALSE, message=FALSE---------------------------------
# # ship data set
# source_system_name <- "ship"
# target_system_name <- source_system_name
# 
# mdr_filename <- "mdr.csv"
# output_dir <- file.path(tempdir(), "output")
# logfile_dir = tempdir()
# 
# # only works if "ship_data.csv" lies next to this RMD-file
# Sys.setenv("SHIP_PATH" = tempdir())
# 
# # provide all arguments to main function
# all_results <- DQAstats::dqa(
#   source_system_name = source_system_name,
#   target_system_name = target_system_name,
#   utils_path = utils_path,
#   mdr_filename = mdr_filename,
#   output_dir = output_dir,
#   logfile_dir = logfile_dir
# )


## ----eval=FALSE---------------------------------------------------------------
# library(DQAgui)
# # set basepath for file-browser in GUI-config
# Sys.setenv("CSV_SOURCE_BASEPATH" = tempdir())
# 
# ## launch GUI
# DQAgui::launch_app(
#   utils_path = utils_path,
#   mdr_filename = "mdr.csv",
#   parallel = FALSE
# )
# # nolint end
Any scripts or data that you put into this service are public.
DQAstats documentation built on April 12, 2025, 2:21 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DQAstats
Core Functions for Data Quality Assessment

inst/doc/DQAstats2SHIPdataset.R
In DQAstats: Core Functions for Data Quality Assessment

Try the DQAstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DQAstats Core Functions for Data Quality Assessment

inst/doc/DQAstats2SHIPdataset.R In DQAstats: Core Functions for Data Quality Assessment

Try the DQAstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DQAstats
Core Functions for Data Quality Assessment

inst/doc/DQAstats2SHIPdataset.R
In DQAstats: Core Functions for Data Quality Assessment