StataDataSetDao <- R6::R6Class("StataDataSetDao", # nolint
private = list(
original_data_set = NULL,
variable_names = NULL,
variable_labels = NULL,
value_label_container = NULL,
languages = NULL,
missing_conditions = NULL,
label_table = NULL,
variables_no_distribution = NULL,
extract_languages = function(ds) {
# available languages in data set
private$languages <- readstata13::get.lang(ds, print = FALSE)
private$languages <- lapply(private$languages, tolower)
# languages in some stata data sets are not correctly specified
lang_try_en <- try(readstata13::get.label.name(ds, colnames(ds[1]), "en"),
silent = TRUE
)
if (class(lang_try_en) == "try-error") {
private$languages[["languages"]] <- "de"
} else {
lang_try_de <- try(readstata13::get.label.name(ds, colnames(ds[1]),
"de"), silent = TRUE)
if (class(lang_try_de) == "try-error") {
private$languages[["languages"]] <- "en"
}
}
assertthat::assert_that("de" %in% private$languages[["languages"]] ||
"en" %in% private$languages[["languages"]],
msg = "Data Set must contain language 'de' or 'en'!")
},
extract_variable_labels = function(ds) {
# get variable labels of variables in stata data set
private$variable_labels <- list()
if ("de" %in% private$languages[["languages"]]) {
varlabel_de <- readstata13::varlabel(ds, lang = "de")
if (!is.null(varlabel_de)) {
private$variable_labels[["de"]] <- varlabel_de
} else {
private$variable_labels[["de"]] <- NULL
}
} else {
private$variable_labels[["de"]] <- NULL
}
if ("en" %in% private$languages[["languages"]]) {
private$variable_labels[["en"]] <- readstata13::varlabel(ds,
lang = "en")
varlabel_en <- readstata13::varlabel(ds, lang = "en")
if (!is.null(varlabel_en)) {
private$variable_labels[["en"]] <- varlabel_en
} else {
private$variable_labels[["en"]] <- NULL
}
} else {
private$variable_labels[["en"]] <- NULL
}
},
# get value label container of variables in stata data set
extract_value_label_container = function(ds) {
private$value_label_container <- list()
if (private$languages[["default"]] == "de" &&
"en" %in% private$languages[["languages"]]) {
private$value_label_container[["de"]] <- setNames(
names(attr(ds, "val.labels")),
attr(ds, "names")
)
private$value_label_container[["en"]] <- readstata13::get.label.name(ds,
NULL, "en")
}
else if (private$languages[["default"]] == "de"
&& !("en" %in% private$languages[["languages"]])) {
private$value_label_container[["de"]] <- setNames(names(attr(ds,
"val.labels")), attr(ds, "names"))
private$value_label_container[["en"]] <- NULL
}
else if (private$languages[["default"]] == "en" && "de" %in%
private$languages[["languages"]]) {
private$value_label_container[["en"]] <- setNames(names(attr(ds,
"val.labels")), attr(ds, "names"))
private$value_label_container[["de"]] <- readstata13::get.label.name(ds,
NULL, "de")
}
else if (private$languages[["default"]] == "en" && (!"de" %in%
private$languages[["languages"]])) {
private$value_label_container[["en"]] <- setNames(names(attr(ds,
"val.labels")), attr(ds, "names"))
private$value_label_container[["de"]] <- NULL
}
},
evaluate_missing_conditions = function(values, conditions) {
temp <- data.frame(matrix(nrow = length(conditions),
ncol = length(values)))
for (i in 1:length(conditions)) {
temp[i, ] <- eval(parse(text = conditions[i]))
}
out <- unname(apply(temp, MARGIN = 2, any))
return(out)
},
is_valid_variable_name = function(variable_name) {
assertthat::assert_that(variable_name %in% private$variable_names,
msg = paste0("Data Set does not contain variable '",
variable_name, "'!"))
}
),
public = list(
initialize = function(data_set_location,
missing_conditions_string =
c("values %in% \"-966 nicht bestimmbar\"",
"values %in% \"-968 unplausibler Wert\"",
"values %in% \"-995 keine Teilnahme (Panel)\"",
"values %in% \"-998 keine Angabe\"",
"values %in% \"-989 filterbedingt fehlend\""),
missing_conditions_date = "is.na(values)",
missing_conditions_numeric = "values <= -800", #nolint
variables_no_distribution = c("pid", "id")) { #nolint
private$missing_conditions = list() #nolint
private$missing_conditions[["string"]] = missing_conditions_string #nolint
private$missing_conditions[["numeric"]] = missing_conditions_numeric #nolint
private$missing_conditions[["date"]] = missing_conditions_date #nolint
cat(paste0("Read stata file \"", data_set_location, "\n"))
private$original_data_set <- readstata13::read.dta13(data_set_location,
convert.factors = FALSE
)
private$variable_names <- colnames(private$original_data_set)
private$extract_languages(private$original_data_set)
private$extract_variable_labels(private$original_data_set)
private$extract_value_label_container(private$original_data_set)
private$label_table <- attr(private$original_data_set, "label.table")
private$variables_no_distribution <- variables_no_distribution
},
get_storage_type = function(variable_name) {
if (private$is_valid_variable_name(variable_name)) {
return(typeof(private$original_data_set[[variable_name]]))
}
},
get_data_type = function(variable_name) {
if (private$is_valid_variable_name(variable_name)) {
data_type <- I18nString$new()
data_type$set_de(
dplyr::case_when(
is.character(private$original_data_set[[variable_name]]) ~ "string", #nolint
is.numeric(private$original_data_set[[variable_name]]) ~ "numerisch", #nolint
lubridate::is.timepoint(private$original_data_set[[variable_name]]) ~ "datum" #nolint
)
)
data_type$set_en(
dplyr::case_when(
is.character(private$original_data_set[[variable_name]]) ~ "string", #nolint
is.numeric(private$original_data_set[[variable_name]]) ~ "numeric", #nolint
lubridate::is.timepoint(private$original_data_set[[variable_name]]) ~ "date" #nolint
)
)
return(data_type)
}
},
get_variable_label = function(variable_name) {
if (private$is_valid_variable_name(variable_name)) {
variable_label <- I18nString$new()
variable_label$set_de(private$variable_labels[["de"]][[variable_name]])
variable_label$set_en(private$variable_labels[["en"]][[variable_name]])
return(variable_label)
}
},
get_distribution = function(variable_name, scale_level_en, access_ways) {
if (!("not-accessible" %in% access_ways) &&
!(variable_name %in% private$variables_no_distribution)) {
if (private$is_valid_variable_name(variable_name)) {
data_type_en <- self$get_data_type(variable_name)$get_en()
original_values <- private$original_data_set[[variable_name]]
if (data_type_en %in% c("numeric", "string")) {
assertthat::assert_that(!(NA %in% original_values),
msg = paste0("Found NA in variable '",
variable_name, "'! Does it contain STATA-Systemmissings?"))
if (data_type_en == "string") {
assertthat::assert_that(all(nzchar(original_values)),
msg = paste0("Found empty string in variable '",
variable_name, "'! Does it contain STATA-Systemmissings?"))
}
}
de <- ""
en <- ""
if ("de" %in% private$languages[["languages"]]) {
de <- private$label_table[[private$value_label_container$de[variable_name]]] #nolint
}
if ("en" %in% private$languages[["languages"]]) {
en <- private$label_table[[private$value_label_container$en[variable_name]]] #nolint
}
value_labels <- list(
de = de,
en = en
)
if (data_type_en == "date") {
original_values <- lubridate::date(original_values)
}
valid_values <- original_values[!(
private$evaluate_missing_conditions(original_values,
private$missing_conditions[[data_type_en]])
)]
missing_values <- original_values[
private$evaluate_missing_conditions(original_values,
private$missing_conditions[[data_type_en]])
]
# attach S3 class attributes to data set column for further
# calculations
assertthat::assert_that(assertthat::noNA(scale_level_en),
msg = paste0("No scale level found for variable '",
variable_name, "'!"))
assertthat::assert_that(assertthat::noNA(data_type_en),
msg = paste0("No data type found for variable '",
variable_name, "'!"))
class(original_values) <- c(class(original_values), data_type_en,
scale_level_en)
class(valid_values) <- c(class(valid_values), data_type_en,
scale_level_en)
distribution <- calculate_frequencies(original_values, valid_values)
distribution$set_valid_responses(calculate_valid_responses(
original_values, valid_values, value_labels
))
distribution$set_missings(calculate_missings(
original_values, missing_values, value_labels
))
distribution$set_statistics(calculate_statistics(valid_values))
return(distribution)
}
}
},
get_index_in_data_set = function(variable_name) {
if (private$is_valid_variable_name(variable_name)) {
return(which(private$variable_names == variable_name) - 1)
}
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.