Nothing
#' @title
#' Transform single-row category information to multiple rows as element
#'
#' @description
#' Expands data dictionary column(s) in a element (the parameter 'from'),
#' into another element (the parameter 'to').
#' If the element `from` contains any column starting with 'prefix', (xx,yy),
#' these columns will be added as 'xx' and 'yy' in the element identified by
#' `to`. This data frame will be created if necessary, and columns will be
#' added, from left to right. (unique names will be generated if necessary).
#' Separator of each element is the following structure :
#' 'name = xx1 ; name = xx2'.
#' This function is mainly used to expand the column(s) 'Categories::xx' in
#' "Variables" to "Categories" element with column(s) xx.
#' This function is the reversed operation of [data_dict_collapse()]
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [data_dict_collapse()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param from A symbol identifying the name of the element (data frame) to take
#' column(s) from. Default is 'Variables'.
#' @param to A symbol identifying the name of the element (data frame) to create
#' column(s) to. Default is 'Categories'.
#' @param name_prefix Character string of the prefix of columns of interest.
#' This prefix will be used to select columns, and to rename them in the 'to'
#' element. Default is 'Categories::'.
#'
#' @returns
#' A list of data frame(s) identifying a data dictionary.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$`data_dict_PARIS - collapsed`
#' data_dict_expand(data_dict)
#'
#' }
#'
#' @import dplyr tidyr stringr fabR
#' @importFrom crayon bold
#' @importFrom rlang .data
#'
#' @export
data_dict_expand <- function(
data_dict,
from = 'Variables',
name_prefix = 'Categories::',
to = 'Categories'){
# test
as_data_dict_shape(data_dict)
from <- substitute(from)
if(typeof(from) == "character") from <- as.symbol(from)
if(typeof(from) == "symbol") from <- substitute(from)
if(typeof(from) == "language") from <- as.symbol(from)
to <- substitute(to)
if(typeof(to) == "character") to <- as.symbol(to)
if(typeof(to) == "symbol") to <- substitute(to)
if(typeof(to) == "language") to <- as.symbol(to)
if(is.null(data_dict[[from]])){
stop(call. = FALSE,
"Your data dictionary contains no '",from,"' element.")}
names_col <-
data_dict[[from]] %>%
select(starts_with(name_prefix), - any_of(name_prefix)) %>% names
if(length(names_col) == 0){
warning(
"Your data dictionary contains no column starting with '",
name_prefix,"' in ",from)
return(data_dict)}
rename_col <- make.unique(str_remove(names_col,name_prefix))
data_dict[[to]] <-
tibble(name = as.character(),variable = as.character()) %>%
bind_rows(data_dict[[to]])
for(i in names_col){
# stop()}
tryCatch(
{to_temp <-
data_dict[[from]] %>%
select(variable = "name", col_to = !! i ) %>%
dplyr::filter(!is.na(.data$`col_to`)) %>%
mutate(
col_to = ifelse(str_detect(.data$`col_to`, "_="),
str_replace_all(.data$`col_to`, "_=", "__SEP_IN__"),
.data$`col_to`),
col_to = ifelse(str_detect(.data$`col_to`, "_;"),
str_replace_all(.data$`col_to`, "_;", "__SEP_OUT__"),
.data$`col_to`),
col_to = ifelse(str_detect(.data$`col_to`, "__SEP_IN__"),
.data$`col_to`,
str_replace_all(.data$`col_to`, "=", "__SEP_IN__")),
col_to = ifelse(str_detect(.data$`col_to`, "__SEP_OUT__"),
.data$`col_to`,
str_replace_all(.data$`col_to`, ";", "__SEP_OUT__"))
) %>%
separate_rows("col_to", sep="__SEP_OUT__") %>%
separate(.data$`col_to`, into = c("name",i), sep = "__SEP_IN__") %>%
mutate_all(~ str_squish(.)) %>%
rename_with(.cols = !! i ,.fn = ~ rename_col[which(names_col == i)])
data_dict[[to]] <-
data_dict[[to]] %>%
full_join(to_temp,by = c("name","variable"))}
,
warning=function(w) {
# Choose a return value in case of warning
error_vars <-
silently_run({(
data_dict[[from]] %>%
select(variable = "name", col_to = !! i ) %>%
dplyr::filter(!is.na(.data$`col_to`)) %>%
mutate(
col_to = ifelse(str_detect(.data$`col_to`, "_="),
str_replace_all(
.data$`col_to`, "_=", "__SEP_IN__"),
.data$`col_to`),
col_to = ifelse(str_detect(.data$`col_to`, "_;"),
str_replace_all(
.data$`col_to`, "_;", "__SEP_OUT__"),
.data$`col_to`),
col_to = ifelse(str_detect(.data$`col_to`, "__SEP_IN__"),
.data$`col_to`,
str_replace_all(
.data$`col_to`, "=", "__SEP_IN__")),
col_to = ifelse(str_detect(.data$`col_to`, "__SEP_OUT__"),
.data$`col_to`,
str_replace_all(
.data$`col_to`, ";", "__SEP_OUT__"))) %>%
separate_rows("col_to", sep="__SEP_OUT__") %>%
separate(.data$`col_to`,
into = c("name", i),
sep = "__SEP_IN__") %>%
dplyr::filter(is.na(!! i)) %>%
pull(.data$`variable`) %>% toString)
})
stop(call. = FALSE,
"\n\nParsing elements failures in your data dictionary.",
"\nVariables affected:\n",
error_vars,"\n",
"Column affected: ",i,"\n",
bold("\n\nUseful tip:"),
" If your colums contains ',' or '=' in
its labels, replace the separators by '_;' and '_=' and reprocess.
Example:
> wrong: '0 = No alcohol ; 1 = Alcohol(red ; white)'
> good : '0 = No alcohol ",bold("_;")," 1 = Alcohol(red ; white)'\n")
})
}
data_dict[[from]] <- data_dict[[from]] %>% select(- !! names_col)
data_dict <- as_data_dict_shape(data_dict)
return(data_dict)
}
#' @title
#' Transform multi-row category column(s) to single rows and join to "Variables"
#'
#' @description
#' Collapses a data dictionary element (the parameter 'from'),
#' into column(s) in another element (the parameter 'to')
#' If the element 'to' exists, and contains any column 'xx' or 'yy', these
#' columns will be added to the element 'from' under the names 'to:xx'
#' and 'to:yy'. (unique names will be generated if necessary). Each element
#' of these column will gather all information to process the reverse operation.
#' Separator of each element is the following structure :
#' 'name = xx1 ; name = xx2'.
#' This function is mainly used to collapse the 'Categories' element into
#' columns in 'Variables'.
#' This function is the reversed operation of [data_dict_expand()]
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [data_dict_expand()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param from A symbol identifying the name of the element (data frame) to take
#' column(s) from. Default is 'Categories'.
#' @param to A symbol identifying the name of the element (data frame) to create
#' column(s) to. Default is 'Variables'.
#' @param name_prefix A character string of the prefix of columns of interest.
#' This prefix will be used to select columns, and to rename them in the 'to'
#' element. Default is 'Categories::'.
#'
#' @returns
#' A list of data frame(s) identifying a data dictionary.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' data_dict_collapse(data_dict)
#'
#' }
#'
#' @import dplyr tidyr stringr
#' @importFrom rlang .data
#'
#' @export
data_dict_collapse <- function(
data_dict,
from = 'Categories',
to = 'Variables',
name_prefix = 'Categories::'){
# test
as_data_dict_shape(data_dict)
from <- substitute(from)
if(typeof(from) == "character") from <- as.symbol(from)
if(typeof(from) == "symbol") from <- substitute(from)
if(typeof(from) == "language") from <- as.symbol(from)
to <- substitute(to)
if(typeof(to) == "character") to <- as.symbol(to)
if(typeof(to) == "symbol") to <- substitute(to)
if(typeof(to) == "language") to <- as.symbol(to)
if(is.null(data_dict[[from]])){
warning("Your data dictionary contains no '",from,"' element.")
return(data_dict)}
if(is.null(data_dict[[to]])){
warning("Your data dictionary contains no '",to,"' element.")
return(data_dict)}
# add categories content
if(sum(nrow(data_dict[[from]])) > 0){
col_from <- tibble(name = as.character())
for(i in names(data_dict[[from]] %>% select(-c("name","variable")))){
# stop()}
cat_temp <-
data_dict[[from]] %>%
select("variable","name",!! i) %>%
unite("from", .data$`name`, !! i, sep = " __SEP_IN__ ") %>%
group_by(.data$`variable`) %>%
summarise(from = paste0(.data$`from`,collapse = " __SEP_OUT__ \n"),
.groups = "drop") %>%
mutate(
from = ifelse(str_detect(.data$`from`, ";"),
str_replace_all(.data$`from`, "__SEP_OUT__", "_;"),
.data$`from`),
from = ifelse(str_detect(.data$`from`, "="),
str_replace_all(.data$`from`, "__SEP_IN__", "_="),
.data$`from`),
from = str_replace_all(.data$`from`, "__SEP_OUT__", ";"),
from = str_replace_all(.data$`from`, "__SEP_IN__", "="))
names(cat_temp) <- c("name",paste0(name_prefix,i))
col_from <- full_join(col_from,cat_temp, by = "name")
}
data_dict[[to]] <-
data_dict[[to]] %>%
full_join(col_from, by = c("name"))
data_dict[[from]] <- NULL
}
data_dict <- as_data_dict_shape(data_dict)
return(data_dict)
}
#' @title
#' Transform column(s) of a data dictionary from long format to wide format
#'
#' @description
#' Transforms column(s) of a data dictionary from long format to wide format.
#' If a taxonomy is provided, the corresponding columns in the data
#' dictionary will be converted to a format with the taxonomy expanded.
#' This operation is equivalent to performing a [tidyr::pivot_wider()] on these
#' columns following the taxonomy structure provided. Variable names in the
#' data dictionary must be unique.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' A taxonomy is a classification schema that can be defined for variable
#' attributes. A taxonomy is usually extracted from an
#' [Opal environment](https://www.obiba.org/pages/products/opal/), and a
#' taxonomy object is a data frame that must contain at least the columns
#' `taxonomy`, `vocabulary`, and `terms`. Additional details about Opal
#' taxonomies are
#' [available online](https://opaldoc.obiba.org/en/latest/web-user-guide/administration/taxonomies.html).
#'
#' @seealso
#' [tidyr::pivot_wider()], [as_data_dict()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param taxonomy An optional data frame identifying a variable classification
#' schema.
#'
#' @returns
#' A list of data frame(s) identifying a data dictionary.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$`data_dict_PARIS - collapsed`
#' taxonomy <- madshapR_DEMO$taxonomy_PARIS
#' data_dict_pivot_wider(data_dict, taxonomy)
#'
#' }
#'
#' @import dplyr tidyr stringr fabR
#' @importFrom rlang .data
#'
#' @export
data_dict_pivot_wider <- function(data_dict, taxonomy = NULL){
# test
as_data_dict_shape(data_dict)
if(is.null(taxonomy)) return(data_dict)
data_dict_init <- data_dict
data_dict_unique_name <-
make.unique(replace_na(data_dict[['Variables']]$`name`,"NA"))
data_dict[['Variables']]$`name` <- data_dict_unique_name
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
mutate(across(everything(),as.character))
taxonomy <- as_taxonomy(taxonomy)
taxonomy_id <-
taxonomy %>%
unite(
col = "taxonomy_id",
c("taxonomy", "vocabulary"),
na.rm = TRUE,
sep = "::",
remove = FALSE) %>%
arrange("index_taxonomy", "index_vocabulary", "index_term")
taxonomy_id <-
taxonomy_id[,
c('index_vocabulary','taxonomy_id','taxonomy','vocabulary')] %>%
distinct() %>%
mutate(
name_col = str_replace(
.data$`taxonomy_id`,
.data$`vocabulary`,
as.character(.data$`index_vocabulary`)),
name_term = paste0(.data$`name_col`,".term"))
taxonomy_id <- taxonomy_id[,c('name_col','name_term','taxonomy')]
taxonomy_id <-
taxonomy_id[
taxonomy_id$`name_col`%in% names(data_dict[["Variables"]]) &
taxonomy_id$`name_term` %in% names(data_dict[["Variables"]]),]
if(nrow(taxonomy_id) > 0){
for(i in seq_len(nrow(taxonomy_id))){
# stop()}
name_col <- taxonomy_id$`name_col`[i]
name_term <- taxonomy_id$`name_term`[i]
col_final <- taxonomy_id$`taxonomy`[i]
data_dict_colnames <-
intersect(
names(data_dict[['Variables']]),
c('name', name_col, name_term))
data_dict_temp <-
data_dict[['Variables']][,data_dict_colnames] %>%
pivot_wider(
names_from = all_of(name_col),
values_from = all_of(name_term),
names_prefix = paste0("__temp__.",col_final,"::"))
data_dict_temp <-
data_dict_temp[vapply(X = data_dict_temp,
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))]
col_temp <- names(data_dict_temp)[-1]
col_final <- str_remove(col_temp,"^__temp__\\.")
if(length(col_temp)){
for(j in seq_len(length(col_temp))){
# stop()}
col_temp_j <- col_temp[j]
col_final_j <- col_final[j]
if(col_temp_j %in% names(data_dict[['Variables']])){
stop(call. = FALSE,
"Column name ",col_temp_j, " already exists in your data dictionary")}
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
full_join(
data_dict_temp[,c('name',col_temp_j)] ,
by = c("name"))
if(sum(names(data_dict[['Variables']]) %in% col_final_j) == 1){
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
unite(!! col_final_j,
!! col_final_j,
!! col_temp_j, sep = "|", na.rm = TRUE) %>%
mutate(across(!! col_final_j, ~ na_if(.,"")))
}else{
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
rename_with(.cols = any_of(col_temp_j), .fn = ~ col_final_j)
}
}
}
data_dict[['Variables']][,c(name_col,name_term)] <- NULL
}
if(paste0(attributes(taxonomy)$`madshapR::class`,"") == "taxonomy_mlstr"){
keys <-
taxonomy[!is.na(taxonomy$`vocabulary_short`),
c('vocabulary','vocabulary_short')] %>%
distinct %>%
mutate(
vocabulary_short =
paste0("Mlstr_area::",.data$`vocabulary_short`)) %>%
mutate(vocabulary = paste0("Mlstr_area::",.data$`vocabulary`))
col_area <-
names(data_dict[['Variables']][,intersect(
keys$`vocabulary_short`,
names(data_dict[['Variables']]))])
for(i in col_area){
# stop()}
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
rename_with(
.cols = any_of(i),
.fn = ~ keys[keys$`vocabulary_short` == i,][['vocabulary']])
}
taxo_scales <-
taxonomy %>%
unite(
col = "area_scale_id",
c("taxonomy_scale", "vocabulary_scale"),
na.rm = TRUE,
sep = "::",
remove = FALSE) %>%
mutate(area_scale_id = na_if(.data$`area_scale_id`, ""))
taxo_scales <-
taxo_scales[
!is.na(taxo_scales$`area_scale_id`),
c('area_scale_id','term_scale')] %>%
distinct %>%
rename("___area_scale_id___" = "area_scale_id") %>%
rename("Mlstr_area::1.scale" = "term_scale")
if(!is.null(data_dict[['Variables']][['Mlstr_area::1.scale']]) &
all(is.na(data_dict[['Variables']][['Mlstr_area::1.scale']]))){
data_dict[['Variables']][['Mlstr_area::1.scale']] <- NULL}
if(!is.null(data_dict[['Variables']][['Mlstr_area::1.scale']]) &
!all(is.na(data_dict[['Variables']][['Mlstr_area::1.scale']]))){
if(!is.null(data_dict[['Variables']][['___area_scale_id___']])){
stop(call. = FALSE,
"Column name '___area_scale_id___' already exists in your data dictionary")}
silently_run({
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
left_join(
data_dict[['Variables']][
!is.na(data_dict[['Variables']][['Mlstr_area::1.scale']]),
c('name','Mlstr_area::1.scale')] %>%
left_join(taxo_scales, by = "Mlstr_area::1.scale") %>%
pivot_wider(
names_from = "___area_scale_id___",
values_from = "Mlstr_area::1.scale") ,
by = c("name") )
data_dict[['Variables']]['Mlstr_area::1.scale'] <- NULL
})
if(!is.null(data_dict[['Variables']][['NA']])){
warning(toString(unique(
data_dict[['Variables']][['NA']][!is.na(data_dict[['Variables']][['NA']])])),
" scale(s) not in your taxonomy but present in your data dictionary")}
}
}
}
# verification of the taxonomy, terms and vocabularies
new_names <-
names(data_dict[['Variables']])[
!names(data_dict[['Variables']]) %in%
names(data_dict_init[['Variables']])]
authorized_names <-
taxonomy %>%
select("taxonomy", "vocabulary") %>% distinct %>%
unite(col = "taxonomy_id", c("taxonomy", "vocabulary"),
na.rm = TRUE, sep = "::", remove = FALSE) %>%
pull(.data$`taxonomy_id`)
if(paste0(attributes(taxonomy)$`madshapR::class`,"") == "taxonomy_mlstr"){
authorized_names <-
c(authorized_names ,
taxonomy %>%
select("taxonomy_scale", "vocabulary_scale") %>% distinct %>%
unite(col = "area_scale_id",
c("taxonomy_scale", "vocabulary_scale"),
na.rm = TRUE, sep = "::", remove = FALSE) %>%
dplyr::filter(.data$`area_scale_id` != "") %>%
pull(.data$`area_scale_id`))
}
wrong_names <- new_names[! new_names %in% authorized_names]
if(length(wrong_names) > 0){
warning(wrong_names %>% toString,
" column name(s) not in your taxonomy but present in your data dictionary")}
data_dict[['Variables']]$`name` <- data_dict_init[['Variables']]$`name`
return(data_dict)
}
#' @title
#' Transform column(s) of a data dictionary from wide format to long format
#'
#' @description
#' Transforms column(s) of a data dictionary from wide format to long format.
#' If a taxonomy is provided, the corresponding columns in the data
#' dictionary will be converted to a standardized format with fewer columns.
#' This operation is equivalent to performing a [tidyr::pivot_longer()] on
#' these columns following the taxonomy structure provided. Variable names in
#' the data dictionary must be unique.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' A taxonomy is a classification schema that can be defined for variable
#' attributes. A taxonomy is usually extracted from an
#' [Opal environment](https://www.obiba.org/pages/products/opal/), and a
#' taxonomy object is a data frame that must contain at least the columns
#' `taxonomy`, `vocabulary`, and `terms`. Additional details about Opal
#' taxonomies are
#' [available online](https://opaldoc.obiba.org/en/latest/web-user-guide/administration/taxonomies.html).
#'
#' @seealso
#' [tidyr::pivot_longer()], [as_data_dict()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param taxonomy An optional data frame identifying a variable classification
#' schema.
#'
#' @returns
#' A list of data frame(s) identifying a data dictionary.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$`data_dict_PARIS - collapsed`
#' taxonomy <- madshapR_DEMO$taxonomy_PARIS
#' data_dict_pivot_longer(data_dict,taxonomy)
#'
#' }
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
data_dict_pivot_longer <- function(data_dict, taxonomy = NULL){
# test
as_data_dict_shape(data_dict)
if(is.null(taxonomy)) { return(data_dict) }else{
as_taxonomy(taxonomy)
}
# make unique names for names in data dictionary
data_dict_init <- data_dict
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
add_index("madshapR::index", .force = TRUE)
data_dict[['Variables']]$`name` <-
make.unique(replace_na(data_dict[['Variables']]$`name`,"NA"))
order_taxonomy <-
taxonomy %>%
select('taxonomy') %>%
distinct() %>% pull('taxonomy')
taxonomy_id <-
taxonomy %>%
unite(
col = "taxonomy_id", c('taxonomy', 'vocabulary'),
na.rm = TRUE, sep = "::", remove = FALSE) %>%
unite(
col = "voc_term", c('vocabulary', 'term'),
na.rm = TRUE, sep = "::", remove = FALSE) %>%
arrange(.data$`index_taxonomy`,
.data$`index_vocabulary`,
.data$`index_term`) %>%
group_by(.data$`taxonomy`) %>%
group_split()
names(taxonomy_id) <- sort(order_taxonomy)
taxonomy_id <- taxonomy_id[order_taxonomy]
for(i in names(taxonomy_id)){
# stop()}
taxonomy_i <-
taxonomy_id[[i]] %>%
dplyr::filter(.data$`taxonomy_id` %in%
names(data_dict[['Variables']])) %>%
select('voc_term','taxonomy_id','index_vocabulary',
'index_term','vocabulary') %>%
distinct
if(taxonomy_i %>% nrow > 0){
try({
data_dict_temp <-
data_dict[['Variables']] %>%
select('name',matches(paste0('^',taxonomy_i$`taxonomy_id`,'$'))) %>%
# pivoting area of information
pivot_longer(
cols = starts_with(i),
names_to = i,
names_prefix = paste0(i,"::"),
values_to = "term",
values_drop_na = TRUE) %>%
unite(
col = "voc_term", c(!! i, 'term'),
na.rm = TRUE, sep = "::", remove = FALSE) %>%
left_join(taxonomy_i
,by = 'voc_term') %>%
arrange(!! i, .data$`index_vocabulary`, .data$`index_term`) %>%
mutate(
across(!! i,
~ ifelse(is.na(.data$`taxonomy_id`),NA_character_,.))) %>%
select(-matches('index_vocabulary'),
-matches('index_term'),
-matches('index_term'),
-matches('voc_term'),
-matches('taxonomy_id')) %>%
group_by(.data$`name`) %>%
distinct()
}, silent = TRUE)
group_max_size <- data_dict_temp %>% group_size() %>% max()
arrange_taxonomy <-
paste0(i,"::",rep(1:group_max_size,2) %>% sort(),c("",".term"))
try({
silently_run({
data_dict_temp <-
data_dict_temp %>%
summarise(
across(c(any_of(i), .data$`term`),
~ paste0(.,collapse = "|")),
.groups = "drop") %>%
separate(
col = i,
into = arrange_taxonomy[seq_len(length(arrange_taxonomy))%% 2==1],
sep = "\\|") %>%
separate(
col = .data$`term`,
into = arrange_taxonomy[seq_len(length(arrange_taxonomy))%% 2==0],
sep = "\\|") %>%
ungroup() %>%
select(
everything(),
-any_of(arrange_taxonomy),
any_of(arrange_taxonomy))
})
silently_run({
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
select(-matches(paste0("^",i,"::",taxonomy_i$`vocabulary`,"$"))) %>%
full_join(data_dict_temp, by = c("name"))
})
}, silent = TRUE)}
}
if(paste0(attributes(taxonomy)$`madshapR::class`,"") == "taxonomy_mlstr"){
keys <-
taxonomy %>%
select(.data$`vocabulary`, .data$`vocabulary_short`) %>%
dplyr::filter(!is.na(.data$`vocabulary_short`)) %>% distinct
col_area <-
data_dict[['Variables']] %>%
select(matches("Mlstr_area::[0-9]+$")) %>% names
for(i in col_area){
key <-
keys %>%
rename_with(
.cols = .data$`vocabulary`,
.fn = ~ paste0(i)) %>%
rename_with(
.cols = .data$`vocabulary_short`,
.fn = ~ paste0(i,".vocabulary_short"))
# re-arrange things (can do better)
if(is.null(data_dict[['Variables']][['___Mlstr_temp___']]) |
is.null(data_dict[['Variables']][['___Mlstr_temp___vocabulary']])){
silently_run({
data_dict[['Variables']] <-
data_dict[['Variables']] %>% left_join(key) %>%
rename_with(
.cols = all_of(i),
.fn = ~ "___Mlstr_temp___") %>%
rename_with(
.cols = paste0(i,".vocabulary_short"),
.fn = ~ "___Mlstr_temp___vocabulary") %>%
mutate(`___Mlstr_temp___` = .data$`___Mlstr_temp___vocabulary`) %>%
rename_with(.cols = .data$`___Mlstr_temp___`, .fn = ~ i) %>%
select(-.data$`___Mlstr_temp___vocabulary`)
})
}else{
stop(call. = FALSE,
"Your data dictionary cannot be processed into Maelstrom format.
(presence of `___Mlstr_temp___` column")
}
}
cols_scales <-
taxonomy %>%
unite("area_scale_id", .data$`taxonomy_scale`, .data$`vocabulary_scale`,
na.rm = TRUE, sep = "::", remove = FALSE) %>%
select(.data$`area_scale_id`) %>%
dplyr::filter(!is.na(.data$`area_scale_id`)) %>% distinct %>%
pull(.data$`area_scale_id`) %>%
intersect(names(data_dict[['Variables']]))
if(length(cols_scales) > 0){
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
unite(
"Mlstr_area::1.scale",all_of(cols_scales),sep=" | ",na.rm = TRUE) %>%
mutate(`Mlstr_area::1.scale` = na_if(.data$`Mlstr_area::1.scale`,""))}
arrange_taxonomy <-
paste0("Mlstr_area","::",
rep(seq_len(length(col_area)),2) %>% sort(),
c("",".term"))
# re-arrange things (can do better)
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
select(
everything(),
-starts_with("Mlstr_additional"),
-starts_with("Mlstr_area"),
starts_with("Mlstr_additional"),
matches("^Mlstr_area::1$"),
matches("^Mlstr_area::1.term$"),
matches("^Mlstr_area::1.scale$"),
matches("^Mlstr_area::2$"),
matches("^Mlstr_area::2.term$"),
matches("^Mlstr_area::3$"),
matches("^Mlstr_area::3.term$"),
everything()) %>%
rename_with(
.cols = any_of("Mlstr_additional::1.term"),
.fn = ~ "Mlstr_additional::Source") %>%
rename_with(
.cols = any_of("Mlstr_additional::2.term"),
.fn = ~ "Mlstr_additional::Target") %>%
select(
-matches("^Mlstr_additional::1$"),-matches("^Mlstr_additional::2$"))
}
if(sum(data_dict[["Variables"]][['madshapR::index']]) !=
nrow(data_dict[["Variables"]]) * (nrow(data_dict[["Variables"]]) + 1)/2){
stop(call. = FALSE,
"An error occured in data_dict_pivot_longer(). Please contact us.")
}
data_dict[["Variables"]] <-
data_dict[["Variables"]] %>% arrange(.data$`madshapR::index`) %>%
select(-"madshapR::index")
data_dict[["Variables"]]$`name` <- data_dict_init[["Variables"]]$`name`
return(data_dict)
}
#' @title
#' Subset data dictionary by row values
#'
#' @description
#' Subsets either or both the 'Variables' and 'Categories' elements of a data
#' dictionary. Rows are conserved if their values satisfy the condition.
#' This is a wrapper function analogous to [dplyr::filter()].
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [dplyr::filter()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' filtered.
#' @param filter_var Expressions that are defined in the element 'Variables' in
#' the data dictionary.
#' @param filter_cat Expressions that are defined in the element 'Categories' in
#' the data dictionary.
#' @param filter_all Expressions that are defined both in the 'Categories' and
#' 'Variables' in the data dictionary.
#'
#' @returns
#' A list of data frame(s) identifying a workable data dictionary structure.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' # Create a list of data dictionaries where the column 'table' is added to
#' # refer to the associated dataset. The object created is not a
#' # data dictionary per say, but can be used as a structure which can be
#' # shaped into a data dictionary.
#' library(dplyr)
#'
#' data_dict_list <- list(
#' data_dict_1 <- madshapR_DEMO$data_dict_TOKYO ,
#' data_dict_2 <- madshapR_DEMO$data_dict_MELBOURNE)
#' names(data_dict_list) = c("dataset_TOKYO","dataset_MELBOURNE")
#'
#' data_dict_nest <- data_dict_list_nest(data_dict_list, name_group = 'table')
#'
#' ###### Example 1 search and filter through a column in 'Variables' element
#' data_dict_filter(data_dict_nest,filter_var = "valueType == 'text'")
#'
#' ###### Example 2 search and filter through a column in 'Categories' element
#' data_dict_filter(data_dict_nest,filter_cat = "missing == TRUE")
#'
#' ###### Example 3 search and filter through* a column in 'Variables' element.
#' # The column must exist in both 'Variables' and 'Categories' and have the
#' # same meaning
#' data_dict_filter(data_dict_nest,filter_all = "table == 'dataset_TOKYO'")
#'
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_filter <- function(
data_dict,
filter_var = NULL,
filter_cat = NULL,
filter_all = NULL){
# test if enough data_dict
as_data_dict_shape(data_dict)
if(!is.null(filter_all) & (!is.null(filter_var) | !is.null(filter_cat)))
stop(call. = FALSE,"Too many argments entered")
if( is.null(filter_all) & is.null(filter_var) & is.null(filter_cat))
return(data_dict)
if(!is.null(filter_all))
filter_var <- filter_cat <- filter_all
data_dict[['Variables']] <-
eval(parse(
text = paste(
"data_dict[['Variables']] %>% dplyr::filter(",filter_var,")")))
if(!is.null(data_dict[['Categories']])){
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
dplyr::filter(.data$`variable` %in% data_dict[['Variables']]$`name`)
if(!is.null(filter_cat)){
data_dict[['Categories']] <-
eval(parse(
text=paste(
"data_dict[['Categories']] %>% dplyr::filter(",filter_cat,")")))}
}
if(sum(nrow(data_dict[['Categories']])) == 0)
data_dict[['Categories']] <- NULL
return(data_dict)
}
#' @title
#' Split grouped data dictionaries into a named list
#'
#' @description
#' Divides data dictionary element(s) into the groups defined by the query.
#' This function divides both the 'Variables' and 'Categories' elements (if
#' the group exists under the same definition in in both) into a list of
#' data dictionaries, each with the rows of the associated group and all the
#' original columns, including grouping variables. This function is analogous
#' to running [dplyr::group_by()] and [dplyr::group_split()]. Each element is
#' named using the group values. [data_dict_list_nest()] reverses the effect.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [dplyr::group_by()], [dplyr::group_split()] ,
#' [data_dict_group_by()], [data_dict_list_nest()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param ... Column in the data dictionary to split it by. If not provided, the
#' splitting will be done on the grouping element of a grouped data dictionary.
#'
#' @returns
#' A list of data frame(s) identifying a list of workable data dictionary structure.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' library(dplyr)
#'
#' # Create a list of data dictionaries where the column 'table' is added to
#' # refer to the associated dataset. The object created is not a
#' # data dictionary per say, but can be used as a structure which can be
#' # shaped into a data dictionary.
#'
#' data_dict_list <- list(
#' data_dict_1 <- madshapR_DEMO$data_dict_TOKYO ,
#' data_dict_2 <- madshapR_DEMO$data_dict_MELBOURNE)
#' names(data_dict_list) = c("dataset_TOKYO","dataset_MELBOURNE")
#'
#' data_dict_nest <-
#' data_dict_list_nest(data_dict_list, name_group = 'table') %>%
#' data_dict_group_by(col = "table")
#'
#' glimpse(data_dict_group_split(data_dict_nest,col = "table"))
#'
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_group_split <- function(data_dict, ...){
# test if enough data_dict
as_data_dict_shape(data_dict)
if(!is_grouped_df(data_dict[['Variables']]))
data_dict <- data_dict_group_by(data_dict, ...)
if(!is_grouped_df(data_dict[['Variables']]))
stop(call. = FALSE,
"\n\nThe data dictionary list must be grouped to be split. Please group them
using data_dict_group_by(data_dict, col)")
col <- as.symbol(names(group_keys(data_dict[['Variables']])))
group_names_var <- pull(group_keys(data_dict[['Variables']]))
if(sum(nrow(data_dict[['Categories']])) == 0){
data_dict[['Categories']] <-
tibble(col = as.character()) %>%
rename_with(.cols = "col", ~ deparse(col)) %>%
group_by(!! col)
}
names_var <- names(group_keys(data_dict[['Variables']]))[1]
names_cat <- names(group_keys(data_dict[['Categories']]))[1]
if(names_var != names_cat){
stop(call. = FALSE,
"Grouping column must be the same in 'Variables' and 'Categories'.")}
group_names_cat <- pull(group_keys(data_dict[['Categories']]))
if(!all(group_names_cat %in% group_names_var)) stop(call. = FALSE,
"\nThese data dictionaries contain group of variables in 'Categories' which
cannot be found accross the variables declared in 'Variables'.")
# if(length(group_names_var) == 1) return(data_dict)
data_dicts_var <-
data_dict[['Variables']] %>%
group_split() %>% as.list()
names(data_dicts_var) <- group_names_var
data_dicts_cat <-
data_dict[['Categories']] %>%
group_split() %>% as.list()
names(data_dicts_cat) <- group_names_cat
if(length(data_dicts_cat) == 0) data_dicts_cat <- NULL
data_dict_list <-
vector(mode = "list", length = length(group_names_var))
names(data_dict_list) <- group_names_var
for(i in group_names_var){
# stop()}
data_dict_list[[i]] <- list(Variables = NULL, Categories = NULL)
data_dict_list[[i]] <-
list(
Variables =
bind_rows(data_dict_list[[i]][['Variables']], data_dicts_var[[i]]),
Categories =
bind_rows(data_dict_list[[i]][['Categories']], data_dicts_cat[[i]]))
if(sum(nrow(data_dict_list[[i]][['Categories']])) == 0){
data_dict_list[[i]][['Categories']] <- NULL }
}
return(data_dict_list)
}
#' @title
#' Bind listed data dictionaries
#'
#' @description
#' Binds a list of data dictionaries into one data dictionary.
#' This is a wrapper function analogous to [dplyr::bind_rows()].
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [dplyr::bind_rows()]
#'
#' @param data_dict_list A list of data frame(s) representing metadata to be
#' transformed.
#' @param name_group A character string of one column in the dataset that can be
#' taken as a grouping column.
#'
#' @returns
#' A list of data frame(s) identifying a workable data dictionary structure.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' library(dplyr)
#'
#' # Create a list of data dictionaries where the column 'table' is added to
#' # refer to the associated dataset. The object created is not a
#' # data dictionary per say, but can be used as a structure which can be
#' # shaped into a data dictionary.
#'
#' data_dict_list <- list(
#' data_dict_1 <- madshapR_DEMO$data_dict_TOKYO ,
#' data_dict_2 <- madshapR_DEMO$data_dict_MELBOURNE)
#' names(data_dict_list) = c("dataset_TOKYO","dataset_MELBOURNE")
#'
#' glimpse(data_dict_list_nest(data_dict_list, name_group = 'table'))
#'
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_list_nest <- function(data_dict_list, name_group = NULL){
# test if enough data_dict
data_dict_list %>%
lapply(as_data_dict_shape)
data_dict <- list(Variables = tibble(), Categories = tibble())
for(i in seq_len(length(data_dict_list))){
# stop()}
data_dict[['Variables']] <-
bind_rows(
data_dict[['Variables']],
data_dict_list[[i]][['Variables']] %>%
mutate(across(everything(), as.character)))
if(sum(nrow(data_dict_list[[i]][['Categories']])) > 0){
data_dict[['Categories']] <-
bind_rows(
data_dict[['Categories']],
data_dict_list[[i]][['Categories']] %>%
mutate(across(everything(), as.character)))
}
}
if(!is.null(name_group)){
if(name_group %in%
c(names(data_dict[['Variables']]),names(data_dict[['Categories']]))){
warning(
"The column '",name_group,
"' already exists in data dictionary and will not be added.")
}else{
name_group_col_var <- tibble()
name_group_col_cat <- tibble()
for(i in names(data_dict_list)){
# stop()}
name_group_col_var <-
bind_rows(
name_group_col_var,
data_dict_list[[i]][['Variables']][1] %>%
mutate(name_list_group = names(data_dict_list[i])) %>%
select(.data$`name_list_group`) %>%
rename_with(.cols = .data$`name_list_group`,~ name_group))
if(!is.null(data_dict_list[[i]][['Categories']][1])){
name_group_col_cat <-
bind_rows(
name_group_col_cat,
data_dict_list[[i]][['Categories']][1] %>%
mutate(name_list_group = names(data_dict_list[i])) %>%
select(.data$`name_list_group`) %>%
rename_with(.cols = .data$`name_list_group`,~ name_group))
}
}
data_dict[['Variables']] <-
name_group_col_var %>% bind_cols(data_dict[['Variables']])
data_dict[['Categories']] <-
name_group_col_cat %>% bind_cols(data_dict[['Categories']])
}
}
if(sum(nrow(data_dict[['Categories']])) == 0){
data_dict[['Categories']] <- NULL
}
return(data_dict)
}
#' @title
#' Group listed data dictionaries by specified column names
#'
#' @description
#' Groups the data dictionary element(s) by the groups defined by the query.
#' This function groups both the 'Variables' and 'Categories' elements (if
#' the group exists under the same definition in in both). This function is
#' analogous to running [dplyr::group_by()]. Each element is named using the
#' group values. [data_dict_ungroup()] reverses the effect.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [dplyr::group_by()], [data_dict_ungroup()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#' @param col variable to group by.
#'
#' @returns
#' A list of data frame(s) identifying a workable data dictionary structure.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' # Create a list of data dictionaries where the column 'table' is added to
#' # refer to the associated dataset. The object created is not a
#' # data dictionary per say, but can be used as a structure which can be
#' # shaped into a data dictionary.
#'
#' data_dict_list <- list(
#' data_dict_1 <- madshapR_DEMO$data_dict_TOKYO ,
#' data_dict_2 <- madshapR_DEMO$data_dict_MELBOURNE)
#' names(data_dict_list) = c("dataset_TOKYO","dataset_MELBOURNE")
#'
#' data_dict_nest <- data_dict_list_nest(data_dict_list, name_group = 'table')
#'
#' data_dict_group_by(data_dict_nest, col = "table")
#'
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_group_by <- function(data_dict, col){
# test if enough data_dict
as_data_dict_shape(data_dict)
col <- substitute(col)
if(typeof(col) == "character") col <- as.symbol(col)
if(typeof(col) == "symbol") col <- substitute(col)
if(typeof(col) == "language") col <- as.symbol(col)
if(is.null(col)) return(data_dict)
group_names_var <-
c(sort(unique(data_dict[['Variables']][[col]]),na.last = TRUE))
categories <- TRUE
if(is.null(data_dict[['Categories']])) categories <- FALSE
if(sum(nrow(data_dict[['Categories']])) == 0){
data_dict[['Categories']] <-
tibble(col = as.character()) %>%
rename_with(.cols = "col", ~ deparse(col))}
group_names_cat <-
c(sort(unique(data_dict[['Categories']][[col]]),na.last = TRUE))
if(is.null(group_names_cat))
stop(call. = FALSE,
paste0("Column '",col,"' not found in Categories."))
if(!all(group_names_cat %in% group_names_var))
stop(call. = FALSE,
"\n\nThese data dictionaries contain group of variables in 'Categories' which
cannot be found accross the variables declared in 'Variables'.")
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
arrange(!! col) %>%
group_by(!! col)
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
arrange(!! col) %>%
group_by(!! col)
if(categories == FALSE) data_dict[['Categories']] <- NULL
return(data_dict)
}
#' @title
#' Ungroup data dictionary
#'
#' @description
#' Ungroups the data dictionary element(s). This function ungroups both the
#' 'Variables' and 'Categories' elements (if both are grouped data frames).
#' This function is analogous to running [dplyr::ungroup()].
#' [data_dict_group_by()] allows to group a data dictionary and this function
#' reverses the effect.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [dplyr::ungroup()]
#' [data_dict_group_by()]
#'
#' @param data_dict A list of data frame(s) representing metadata to be
#' transformed.
#'
#' @returns
#' A list of data frame(s) identifying a workable data dictionary structure.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' # Create a list of data dictionaries where the column 'table' is added to
#' # refer to the associated dataset. The object created is not a
#' # data dictionary per say, but can be used as a structure which can be
#' # shaped into a data dictionary.
#'
#' library(dplyr)
#'
#' data_dict_list <- list(
#' data_dict_1 <- madshapR_DEMO$data_dict_TOKYO ,
#' data_dict_2 <- madshapR_DEMO$data_dict_MELBOURNE)
#' names(data_dict_list) = c("dataset_TOKYO","dataset_MELBOURNE")
#'
#' data_dict_nest <-
#' data_dict_list_nest(data_dict_list, name_group = 'table') %>%
#' data_dict_group_by(col = "table")
#'
#' data_dict_ungroup(data_dict_nest)
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_ungroup <- function(data_dict){
# test if enough data_dict
as_data_dict_shape(data_dict)
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
ungroup()
if(!is.null(data_dict[['Categories']]))
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
ungroup()
return(data_dict)
}
#' @title
#' Apply a data dictionary to a dataset
#'
#' @description
#' Applies a data dictionary to a dataset, creating a labelled dataset with
#' variable attributes. Any previous attributes will be preserved. For
#' variables that are factors, variables will be transformed into
#' haven-labelled variables.
#'
#' @details
#' A dataset is a data table containing variables. A dataset object is a
#' data frame and can be associated with a data dictionary. If no
#' data dictionary is provided with a dataset, a minimum workable
#' data dictionary will be generated as needed within relevant functions.
#' Identifier variable(s) for indexing can be specified by the user.
#' The id values must be non-missing and will be used in functions that
#' require it. If no identifier variable is specified, indexing is
#' handled automatically by the function.
#'
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' [attributes()], [haven::labelled()]
#'
#' @param dataset A dataset object.
#' @param data_dict A list of data frame(s) representing metadata of the input
#' dataset. Automatically generated if not provided.
#'
#' @returns
#' A labelled data frame with metadata as attributes, specified for each
#' variable from the input data dictionary.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' dataset <- madshapR_DEMO$dataset_MELBOURNE
#' data_dict <- as_data_dict_mlstr(madshapR_DEMO$data_dict_MELBOURNE)
#' head(data_dict_apply(dataset, data_dict))
#'
#' }
#'
#' @import dplyr tidyr stringr haven
#' @importFrom crayon bold
#' @importFrom rlang .data
#'
#' @export
data_dict_apply <- function(dataset, data_dict = NULL){
# if data_dict empty
if(is.null(data_dict)) data_dict <- data_dict_extract(dataset)
# test
as_dataset(dataset, attributes(dataset)$`madshapR::col_id`)
preserve_attributes <- attributes(dataset)$`madshapR::col_id`
if(toString(attributes(data_dict)$`madshapR::class`) == 'data_dict_mlstr'){
data_dict <-
as_data_dict_mlstr(data_dict,as_data_dict = TRUE, name_standard = FALSE)
}else{data_dict <- as_data_dict(data_dict)}
# names must exist both in dataset and data dictionary
# data dictionary is not applied to dataset, since it may come from an
# automated datadict (text by default).
if(suppressWarnings(check_dataset_variables(dataset, data_dict)) %>%
dplyr::filter(str_detect(.data$`condition`,"\\[ERR\\]")) %>% nrow > 0){
stop(call. = FALSE,
"Names across your data dictionary differ from names across the dataset.",
bold("\n\nUseful tip:"),
" Use dataset_evaluate(dataset, data_dict) to get a full assessment of
your dataset")}
# set cleaning prefix of Variables element
# (addition of Variables:: before all variables except name)
names(data_dict[['Variables']]) <-
make.unique(str_remove(names(data_dict[['Variables']]),"^Variables::"))
names(data_dict[['Variables']])[-
which(names(data_dict[['Variables']])=='name')] <-
paste0("Variables::",
names(data_dict[['Variables']]))[-
which(names(data_dict[['Variables']])=='name')]
# set cleaning prefix of Categories element
# (addition of Categories:: before all variables
# except variable, name, labels and na_values)
if(!is.null(data_dict[['Categories']])){
names(data_dict[['Categories']]) <-
make.unique(str_remove(names(data_dict[['Categories']]),"^Categories::"))
names(data_dict[['Categories']])[-
which(names(data_dict[['Categories']]) %in%
c('variable','name','labels', 'na_values'))] <-
paste0("Categories::",
names(data_dict[['Categories']])[-
which(names(data_dict[['Categories']]) %in%
c('variable','name','labels','na_values'))])}
names_data <- names(dataset)
names_data_dict <- data_dict[['Variables']]$`name`
for(i in names_data) {
# stop()}
vT_list <- madshapR::valueType_list
vT <- valueType_of(x = dataset[[i]])
dataset[[i]] <- as_valueType(x = dataset[[i]],valueType = vT)
attrs_init <- attributes(dataset[[i]])
attrs_var <-
data_dict[['Variables']][which(data_dict[['Variables']]$`name` == i),]
attrs_var <- attrs_var[vapply(X = attrs_var,
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))]
attrs_var <- c(attrs_var) %>% unlist %>% as.list()
attrs_var <- attrs_var[names(attrs_var) != 'name']
attrs_cat <- list()
attrs_fct <- list()
attrs_na <- list(na_values = c())
if(!is.null(data_dict[['Categories']])) {
cat_i <-
data_dict[['Categories']][
which(data_dict[['Categories']]$`variable` == i),]
cat_i <- cat_i[vapply(X = cat_i,
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))]
if(is.null(cat_i[['na_values']])) cat_i[['na_values']] <- NA
if(nrow(cat_i) > 0) {
# create vector of dataset
attributes(dataset[[i]])$`class` <- NULL
vec_data <-
try({as_valueType(x = cat_i$`name`, valueType = vT)},silent = TRUE)
if(class(vec_data)[1] == 'try-error') {
vT_cat <- valueType_guess(x = cat_i$`name`)
vec_data <- as_valueType(x = cat_i$`name`, valueType = vT_cat)}
names(vec_data) <- cat_i$`labels`
attrs_na <- list(na_values = vec_data[which(!is.na(cat_i$`na_values`))])
cat_i$`name` <- NULL
cat_i$`labels` <- NULL
cat_i$`variable` <- NULL
cat_i$`na_values` <- NULL
if(ncol(cat_i) > 0) {
for(j in seq_len(length(cat_i))){
# stop()}
vec_attr <- vec_data
names(vec_attr) <- cat_i[[j]]
vec_attr <- vec_attr[which(!is.na(cat_i[[j]]))]
attrs_cat[[names(cat_i[j])]] <- vec_attr
}}
# val_labels(dataset[[i]]) <- vec_data
attributes(dataset[[i]])$`labels` <- vec_data
attributes(dataset[[i]])$`class` <-
c("haven_labelled","vctrs_vctr",
vT_list[[which(vT_list$`valueType` == vT),"class"]])
attrs_fct <- attributes(dataset[[i]])
}}
attrs_total <- c(attrs_fct,attrs_na, attrs_init, attrs_var,attrs_cat)
attributes(dataset[[i]]) <- attrs_total
# suppression of na_values if empty
if(length(attrs_na[[1]]) == 0) attributes(dataset[[i]])$`na_values` <- NULL
}
dataset <-
dataset[names_data_dict] %>%
as_tibble() %>%
as_dataset(col_id = preserve_attributes)
return(dataset)
}
#' @title
#' Generate a data dictionary from a dataset
#'
#' @description
#' Generates a data dictionary from a dataset. If the dataset variables have no
#' associated metadata, a minimum data dictionary is created by using variable
#' attributes.
#'
#' @details
#' A dataset is a data table containing variables. A dataset object is a
#' data frame and can be associated with a data dictionary. If no
#' data dictionary is provided with a dataset, a minimum workable
#' data dictionary will be generated as needed within relevant functions.
#' Identifier variable(s) for indexing can be specified by the user.
#' The id values must be non-missing and will be used in functions that
#' require it. If no identifier variable is specified, indexing is
#' handled automatically by the function.
#'
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' The object may be specifically formatted to be compatible with additional
#' [Maelstrom Research software](https://maelstrom-research.org/page/software),
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @param dataset A dataset object.
#' @param as_data_dict_mlstr Whether the input data dictionary should be coerced
#' with specific format restrictions for compatibility with other
#' Maelstrom Research software. TRUE by default.
#'
#' @returns
#' A list of data frame(s) representing metadata of the dataset variables.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' ###### Example 2: extract data dictionary from any dataset (the
#' # data dictionary will be created upon attributes of the dataset. Factors
#' # will be considered as categorical variables)
#' data_dict_extract(iris)
#'
#' }
#'
#' @import dplyr tidyr stringr fabR
#' @importFrom rlang .data
#'
#' @export
data_dict_extract <- function(dataset, as_data_dict_mlstr = TRUE){
# test
as_dataset(dataset) # no col_id
if(!is.logical(as_data_dict_mlstr))
stop(call. = FALSE,
'`as_data_dict_mlstr` must be TRUE or FALSE (TRUE by default)')
dataset <- ungroup(dataset)
data_dict <-
list(
Variables = tibble(name = as.character()),
Categories = tibble(variable = as.character(), name = as.character()))
for(i in names(dataset)){
# stop()}
attrs_i <- attributes(dataset[[i]])
attrs_i$`tzone` <- NULL
attrs_i$`class` <- NULL
if(is.factor(dataset[[i]])){
names(attrs_i$`levels`) <- make.unique(attrs_i$`levels`)
}
data_dict_var <- tibble(name = i)
data_dict_cat <- tibble(variable = as.character())
if(length(attrs_i) > 0){
for(j in seq_len(length(attrs_i))){
# stop()}
attr_col_name <- attrs_i[j] %>% names
attr_content_col <- attrs_i[[j]] %>% names
attr_name_cat <- attrs_i[[j]] %>% as.character()
if(is.null(attr_content_col)) {
data_dict_var[attr_col_name] <-
paste0(attr_name_cat,collapse = " _; ")
}else{
cat_attr <- tibble(variable = i, name = attr_name_cat)
cat_attr[[attr_col_name]] <- attr_content_col
data_dict_cat <-
data_dict_cat %>%
full_join(cat_attr,
by = intersect(names(data_dict_cat),names(cat_attr)))
}
}
}
data_dict[['Variables']] <-
data_dict[['Variables']] %>% bind_rows(data_dict_var)
data_dict[['Categories']] <-
data_dict[['Categories']] %>% bind_rows(data_dict_cat)
}
names(data_dict[['Variables']]) <-
make.unique(str_remove(names(data_dict[['Variables']]),"^Variables::"))
names(data_dict[['Categories']]) <-
make.unique(str_remove(names(data_dict[['Categories']]),"^Categories::"))
if(sum(nrow(data_dict[['Categories']])) == 0) data_dict[['Categories']] <- NULL
# if(is.null(data_dict$Variables[['valueType']]) &
# is.null(data_dict$Variables[['typeof']])){
#
# data_dict <- silently_run(valueType_adjust(from = dataset, to = data_dict))
#
# }
if(is.null(data_dict$Variables[['valueType']])){
data_dict$Variables[['valueType']] <- NA
data_dict <- valueType_self_adjust(data_dict)
}
#
# data_dict <- silently_run(valueType_adjust(from = dataset, to = data_dict))
#
data_dict <-
valueType_adjust(from = dataset, to = data_dict) %>%
as_data_dict_mlstr(
as_data_dict = !as_data_dict_mlstr,
name_standard = FALSE)
return(data_dict)
}
#' @title
#' Inner join between a dataset and its associated data dictionary
#'
#' @description
#' Performs an inner join between a dataset and its associated data dictionary,
#' keeping only variables present in both. This function returns the matched
#' dataset rows, the matched data dictionary rows, or both, in a list.
#'
#' @details
#' A dataset is a data table containing variables. A dataset object is a
#' data frame and can be associated with a data dictionary. If no
#' data dictionary is provided with a dataset, a minimum workable
#' data dictionary will be generated as needed within relevant functions.
#' Identifier variable(s) for indexing can be specified by the user.
#' The id values must be non-missing and will be used in functions that
#' require it. If no identifier variable is specified, indexing is
#' handled automatically by the function.
#'
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @param dataset A dataset object.
#' @param data_dict A list of data frame(s) representing metadata of the input
#' dataset.
#' @param data_dict_apply Whether data dictionary(ies) should be applied to
#' associated dataset(s), creating labelled dataset(s) with variable attributes.
#' Any previous attributes will be preserved. FALSE by default.
#' @param output A vector of character string which indicates if the function
#' returns a dataset ('dataset'), data dictionary ('data_dict') of both.
#' Default is c('dataset','data_dict').
#'
#' @returns
#' Either a data frame, identifying the dataset, or a list of data frame(s)
#' identifying a data dictionary. Returns both in a list by default.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' library(dplyr)
#'
#' dataset <- madshapR_DEMO$dataset_MELBOURNE %>% select(-1)
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' head(data_dict_match_dataset(dataset, data_dict, out = 'dataset'))
#' glimpse(data_dict_match_dataset(dataset, data_dict, out = 'data_dict'))
#'
#' }
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
data_dict_match_dataset <- function(
dataset,
data_dict,
data_dict_apply = FALSE,
output = c("dataset","data_dict")){
# test
as_data_dict_shape(data_dict)
as_dataset(dataset) # no col_id
if(!is.logical(data_dict_apply))
stop(call. = FALSE,
'`data_dict_apply` must be TRUE of FALSE (FALSE by default)')
names_data <-
paste0("name %in% c('",paste0(names(dataset),collapse = "','"),"')")
data_dict <- data_dict_filter(data_dict, filter_var = names_data)
dataset <- dataset %>% select(data_dict[['Variables']]$`name`)
if(length(dataset) == 0)
warning("No match found between dataset and data dictionary")
if(data_dict_apply == TRUE)
return(data_dict_apply(dataset, data_dict))
if(all(output[2:1] %in% c("dataset","data_dict")))
return(list(dataset = dataset, data_dict = data_dict))
if(output[1] == c("dataset"))
return(dataset)
if(output[1] == c("data_dict"))
return(data_dict)
# else
stop(call. = FALSE,
"`output` parameter must be either 'dataset' or 'data_dict'.
Leave blank to get both in a list.")
}
#' @title
#' Validate and coerce any object as a workable data dictionary structure
#'
#' @description
#' Validates the input object as a workable data dictionary structure and
#' returns it with the appropriate `madshapR::class` attribute. This function
#' mainly helps validate input within other functions of the package but could
#' be used to check if a data dictionary is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential valid data dictionary to be coerced.
#'
#' @returns
#' A list of data frame(s) with `madshapR::class` 'data_dict_shape'.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_PARIS
#' as_data_dict_shape(data_dict)
#'
#'}
#'
#' @import dplyr tidyr
#' @importFrom rlang .data
#'
#' @export
as_data_dict_shape <- function(object){
# if the Variables sheet is input in parameter
if(sum(names(object) %in% c('Variables')) == 1 & !is.data.frame(object)){
# name column must exist
if(is.null(object[['Variables']][['name']])){
stop(call. = FALSE,
"Column 'name' in 'Variables' is missing in your data dictionary.")}
# if Categories exists
if(!is.null(object[['Categories']])){
#, variable column must exist
if(is.null(object[['Categories']][['variable']])){
stop(call. = FALSE,
"Column 'variable' in 'Categories' is missing in your data dictionary.")}
#, name column must exist
if(is.null(object[['Categories']][['name']])){
stop(call. = FALSE,
"Column 'name' in 'Categories' is missing in your data dictionary.")}
}
# if all test pass:
attributes(object)$`madshapR::class` <- "data_dict_structure"
return(object)
}
# else
stop(call. = FALSE,
"\n\nThis object is not a data dictionary as defined by Maelstrom standards,
which must be a list containing at least 'Variables' list of elements.
Please refer to documentation.")
}
#' @title
#' Validate and coerce any object as a data dictionary
#'
#' @description
#' Checks if an object is a valid data dictionary and returns it with the
#' appropriate `madshapR::class` attribute. This function mainly helps validate
#' inputs within other functions of the package but could be used to check if
#' an object is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential data dictionary object to be coerced.
#'
#' @returns
#' A list of data frame(s) with `madshapR::class` 'data_dict'.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_PARIS
#' as_data_dict(data_dict)
#'
#'}
#'
#' @import dplyr tidyr stringr fabR
#' @importFrom crayon bold
#' @importFrom rlang .data
#'
#' @export
as_data_dict <- function(object){
data_dict <- as_data_dict_shape(object)
# variable names must be unique and non-null
if(check_data_dict_variables(data_dict) %>%
dplyr::filter(str_detect(.data$`condition`,"\\[ERR\\]")) %>% nrow > 0){
stop(call. = FALSE,
"Variable names must exist and be unique in your data dictionary.",
bold("\n\nUseful tip:"),
" Use data_dict_evaluate(data_dict) to get a full assessment of your
data dictionary")}
# variable names must be unique and non-null
if(sum(nrow(data_dict[['Categories']])) > 0){
if(check_data_dict_categories(data_dict) %>%
dplyr::filter(str_detect(.data$`condition`,"\\[ERR\\]")) %>% nrow > 0){
stop(call. = FALSE,
"Variable names in categories must exist and be unique in the data dictionary.",
bold("\n\nUseful tip:"),
" Use data_dict_evaluate(data_dict) to get a full assessment of your
data dictionary")}}
if(nrow(data_dict[['Variables']]) == 0){
data_dict <-
list(Variables = tibble(name = as.character(),typeof = as.character()))
attributes(data_dict)$`madshapR::class` <- "data_dict"
return(data_dict) }
# dataset shaping
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
ungroup() %>%
mutate(across(everything() ,~str_squish(.))) %>%
mutate(across(everything() ,~na_if(.,"")))
if(sum(nrow(data_dict[['Categories']])) > 0){
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
ungroup() %>%
mutate(across(everything(),~str_squish(.))) %>%
mutate(across(everything(),~na_if(.,"")))
}
# if not exists, addition of typeof for categorical variables, text else
if(length(data_dict[['Variables']][['typeof']]) == 0){
# test if vT exists and is good
if(length(data_dict[['Variables']][['valueType']]) > 0){
test_vT <-
data_dict[['Variables']] %>%
select('name', 'valueType') %>%
mutate(`valueType` = replace_na(.data$`valueType`,"character")) %>%
left_join(
madshapR::valueType_list %>%
select(
valueType = 'valueType',
typeof = 'typeof') %>%
distinct,
by = "valueType")
if(all(!is.na(test_vT))){
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
left_join(test_vT %>% select('name','typeof'), by = 'name')
}else if(sum(nrow(data_dict[['Categories']])) > 0){
category_outcomes <-
data_dict[['Categories']] %>%
select('name') %>% distinct %>%
rowwise() %>%
mutate(valueType = valueType_guess(.data$`name`))
category_outcomes <-
data_dict[['Categories']] %>%
select('variable','name') %>%
left_join(category_outcomes, by = "name") %>%
select('variable','valueType') %>%
distinct %>%
group_by(.data$`variable`) %>%
summarise(valueType = paste0(.data$`valueType`,collapse = "|"))
category_outcomes <-
data_dict[['Categories']] %>%
select('variable','name') %>%
left_join(category_outcomes, by = "variable") %>%
group_by(.data$`variable`) %>% group_split() %>% as.list() %>%
lapply(function(x){
test_vT <- str_detect(x$valueType[1], "\\|")
if(test_vT) x <- x %>% mutate(
valueType = valueType_guess(unique(x$name)))
return(x)
}) %>%
bind_rows() %>%
select('variable','valueType') %>% distinct %>%
left_join(madshapR::valueType_list, by = "valueType") %>%
select(name = 'variable','typeof')
data_dict[['Variables']] <-
left_join(data_dict[['Variables']],category_outcomes,by = "name") %>%
mutate(`typeof` = replace_na(.data$`typeof`,'character'))
}else{data_dict[['Variables']][['typeof']] <- 'character'}
}else{data_dict[['Variables']][['typeof']] <- 'character'}
}
if(sum(nrow(data_dict[['Categories']])) > 0){
# addition of valueType for sorting elements
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
select('variable','name') %>%
add_index('madshapR::index', .force = TRUE) %>%
left_join(data_dict[['Variables']] %>%
select(variable = 'name', 'typeof'), by = "variable") %>%
group_by(typeof) %>% group_split() %>% as.list %>%
lapply(function(x) {
test_name <- as_valueType(x$`name`, valueType_guess(unique(x$`name`)))
if(all(as.character(test_name) == as.character(x$name))){
x$name <- test_name}
x <- x %>% arrange(.data$`variable`, .data$`name`) %>%
mutate(name = as.character(.data$`name`))
return(x)}) %>%
bind_rows() %>%
select(-'typeof') %>%
left_join(
data_dict[['Categories']] %>%
add_index('madshapR::index',.force = TRUE) %>%
select(-'name'),by = c('madshapR::index', 'variable')) %>%
select(-'madshapR::index')
# add labels if not exists
if(length(data_dict[['Categories']][['labels']]) == 0){
if(length(data_dict[['Categories']][['levels']]) > 0){
# check if levels equals name (that means the levels are factors)
if(all(data_dict[['Categories']][['levels']] ==
data_dict[['Categories']][['name']],na.rm = TRUE)){
data_dict[['Categories']]['levels'] <- NULL}}
# name label as names
data_dict[['Categories']]['labels'] <- data_dict[['Categories']]['name']
} # else do nothing
# add na_values (as NA, will be removed anyway) if not exists
if(length(data_dict[['Categories']][['na_values']]) == 0){
# name label as names
data_dict[['Categories']]['na_values'] <- NA_character_
} # else do nothing
# gather haven and factors if levels remain
if(length(data_dict[['Categories']][['levels']]) > 0){
# check if levels isnt NA when labels is (recip.) and
# check if na_values is NA when levels is (recip.)
# (that means labels and levels are factors)
if(all(!is.na(data_dict[['Categories']][['levels']]) ==
is.na(data_dict[['Categories']][['labels']]))){
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
mutate(
labels =
ifelse(
!is.na(.data$`levels`) & is.na(.data$`na_values`),
.data$`levels`, .data$`labels`),
levels =
ifelse(
!is.na(.data$`levels`) & (.data$`levels` == .data$`labels`),
NA_character_ , .data$`levels`))}
}
# rearrange elements by missingness, then name, then variable
new_name <-
setdiff(
make.unique(c('missing',names(data_dict[['Categories']])))[-1],
names(data_dict[['Categories']]))
names(data_dict[['Categories']]) <-
make.unique(c("missing",names(data_dict[['Categories']])))[-1]
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
mutate(
missing = !is.na(.data$`na_values`),
missing = ifelse(is.na(.data$`missing`),FALSE,.data$`missing`)) %>%
mutate(
missing =
ifelse(
(.data$`name` < 0 & .data$`missing` == TRUE),
2, .data$`missing`)) %>%
group_by(.data$`variable`) %>%
arrange(.data$`variable`,.data$`missing`) %>%
ungroup() %>%
mutate(missing = ifelse(.data$`missing` == 2, 1, .data$`missing`)) %>%
mutate(missing = as.logical(.data$`missing`)) %>%
select(-'missing')
if(length(new_name) > 0)
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
rename_with(.cols = any_of(new_name) , .fn = ~ paste0('missing'))
}
# reorder things
# dataset shaping
data_dict[['Variables']] <-
data_dict[['Variables']] %>% select('name','typeof',everything())
data_dict[['Variables']] <-
data_dict[['Variables']][
vapply(X = data_dict[['Variables']],
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))]
if(sum(nrow(data_dict[['Categories']])) > 0){
data_dict[['Categories']] <-
inner_join(
data_dict[['Variables']] %>%
select(variable = 'name'), data_dict[['Categories']],
by = "variable",multiple = "all") %>%
select('variable','name','labels',matches("^na_values$"), everything())
data_dict[['Categories']] <-
data_dict[['Categories']][vapply(
X = data_dict[['Categories']],
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))]
}
# if all test pass:
attributes(data_dict)$`madshapR::class` <- "data_dict"
return(data_dict)
}
#' @title
#' Validate and coerce any object as an Opal data dictionary format
#'
#' @description
#' Validates the input object as a valid data dictionary compliant with formats
#' used in Maelstrom Research ecosystem, including Opal, and returns it with
#' the appropriate `madshapR::class` attribute. This function mainly helps
#' validate input within other functions of the package but could be used to
#' check if an object is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' The object may be specifically formatted to be compatible with additional
#' [Maelstrom Research software](https://maelstrom-research.org/page/software),
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential valid data dictionary to be coerced.
#' @param as_data_dict Whether the input data dictionary should not be coerced
#' with specific format restrictions for compatibility with other
#' Maelstrom Research software. FALSE by default.
#' @param name_standard Whether the input data dictionary has variable names
#' compatible with Maelstrom Research ecosystem, including Opal)or not.
#' FALSE by default.
#'
#' @returns
#' A list of data frame(s) with `madshapR::class` 'data_dict_mlstr'.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' as_data_dict_mlstr(madshapR_DEMO$data_dict_MELBOURNE)
#'
#' }
#'
#' @import dplyr tidyr fabR
#' @importFrom crayon bold
#' @importFrom rlang .data
#'
#' @export
as_data_dict_mlstr <- function(
object,
as_data_dict = FALSE,
name_standard = FALSE){
# test if data_dict is already data dictionary
data_dict <- as_data_dict(object)
if(!is.logical(as_data_dict))
stop(call. = FALSE,
'`as_data_dict` must be TRUE of FALSE (FALSE by default)')
# if valueType exists, vT must be valid
if(suppressWarnings(check_data_dict_valueType(data_dict)) %>%
dplyr::filter(str_detect(.data$`condition`,"\\[ERR\\]")) %>% nrow > 0){
stop(call. = FALSE,
"valueType are incompatible with Maelstrom standards.",
bold("\n\nUseful tip:"),
" Use data_dict_evaluate(data_dict) to get a full assessment of your
data dictionary")}
# check missing validity
if(suppressWarnings(check_data_dict_missing_categories(data_dict)) %>%
dplyr::filter(str_detect(.data$`condition`,"\\[ERR\\]")) %>% nrow > 0){
stop(call. = FALSE,
"\n
Incompatible missing value in the missing columns with Maelstrom standards",
bold(
"\n\nUseful tip:"),
" Use data_dict_evaluate(data_dict) to get a full assessment of your
data dictionary")}
# Check standard for names
if(name_standard == TRUE){
if(nrow(check_name_standards(data_dict[['Variables']][['name']])) > 0){
stop(call. = FALSE,
"names are incompatible with Maelstrom standards.",
bold("\n\nUseful tip:"),
" Use data_dict_evaluate(data_dict) to get a full assessment of your
data dictionary")}
}
# assess if tO is good
test_vT <-
data_dict[['Variables']] %>%
select('name', 'typeof') %>%
mutate(`typeof` = replace_na(.data$`typeof`,"character")) %>%
left_join(
madshapR::valueType_list %>%
select(
valueType = 'toValueType',
typeof = 'toTypeof') %>%
distinct,
by = "typeof")
# si pas bon de base, mettre un message et garder tO
if(!all(!is.na(test_vT))){
warning(
"The column 'typeof' in your data dictionary contains values that were
impossible to coerce in valueType. This column can be kept for further
investigations.",
"\n\nVariable(s) name : ",
toString(pull(test_vT[which(is.na(test_vT[['valueType']])),'name'])))
}else{data_dict[['Variables']][['typeof']] <- NULL}
# si vT existe
# add valueType if not exists
if(length(data_dict[['Variables']][['valueType']]) == 0){
# sinon on rajoute vT au dd
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
left_join(test_vT %>% select('name','valueType'), by = 'name')
} # else do nothing
# add label(:xx) if not present
lab_name_var <-
names(data_dict[['Variables']] %>%
select(matches(c("^label$","^label:[[:alnum:]]"))))
# add label if does not exists
if(length(lab_name_var) == 0){
data_dict[['Variables']] <-
data_dict[['Variables']] %>% mutate(label = .data$`name`)}
# fill labels if some are empty
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
mutate(across(any_of(matches(c("^label$","^label:[[:alnum:]]")))[1],
~ ifelse(is.na(.),.data$`name`,.)))
if(sum(nrow(data_dict[['Categories']])) > 0){
# addition of label(:xx) if not present
lab_name_var <-
names(data_dict[['Variables']] %>%
select(matches(c("^label$","^label:[[:alnum:]]"))))[1]
lab_name_cat <-
intersect(lab_name_var,
names(data_dict[['Categories']] %>%
select(matches(c("^label$","^label:[[:alnum:]]")))))
if(length(lab_name_cat) == 0){
# check presence of labels, if identical to name, NULL, rename else
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
rename_with(.cols = "labels", ~ lab_name_var)
}else if(all(data_dict[['Categories']][['labels']] ==
data_dict[['Categories']][['name']])) {
# preserve information from labels into label:xx
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
mutate(across(all_of(lab_name_cat), ~ ifelse(is.na(.),.data$`labels`,.)))
data_dict[['Categories']][['labels']] <- NULL}
# addition of missing if not present
missing_name <-
names(data_dict[['Categories']] %>% select(matches(c("^missing$"))))
if(length(missing_name) == 0){
# check presence of na_values, if identical to label, NULL, rename else
if(length(data_dict[['Categories']][['na_values']]) == 0){
data_dict[['Categories']]$`missing` <- FALSE
}else if(all(data_dict[['Categories']][['na_values']] ==
data_dict[['Categories']][['label']],na.rm = TRUE)){
data_dict[['Categories']]$`missing` <-
!is.na(data_dict[['Categories']]$`na_values`)
data_dict[['Categories']]$`na_values` <- NULL
}else{data_dict[['Categories']]$`missing` <- FALSE}
# check if missings and na_values are duplicated
}else if(all(!is.na(data_dict[['Categories']][['na_values']]) ==
data_dict[['Categories']][['missing']])){
data_dict[['Categories']]$`na_values` <- NULL
}
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
mutate(
missing = as_any_boolean(.data$`missing`),
missing = ifelse(is.na(.data$`missing`),FALSE,.data$`missing`)) %>%
mutate(
missing =
ifelse((
.data$`name` < 0 & .data$`missing` == TRUE),
2, .data$`missing`)) %>%
group_by(.data$`variable`) %>%
arrange(.data$`variable`,.data$`missing`) %>%
ungroup() %>%
mutate(missing = ifelse(.data$`missing` == 2, 1, .data$`missing`)) %>%
mutate(missing = as.logical(.data$`missing`))
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
select(
'variable','name',
matches(c("^label$","^label:[[:alnum:]]")),matches("^missing$"),
everything())
}
if(sum(nrow(data_dict[['Categories']])) == 0)
data_dict[['Categories']] <- NULL
if(as_data_dict == TRUE){
# # check if label and name are duplicated
# if(length(data_dict[['Variables']][['label']]) > 0){
# if(all(data_dict[['Variables']][['name']] ==
# data_dict[['Variables']][['label']])){
# data_dict[['Variables']]$`label` <- NULL}}
# valueType as typeof
if(length(data_dict[['Variables']][['typeof']]) == 0){
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
left_join(
madshapR::valueType_list %>%
select(
valueType = "valueType",
typeof = "toTypeof") %>%
distinct,
by = "valueType") %>%
select(-"valueType")}
if(sum(nrow(data_dict[['Categories']])) > 0){
# protection of labels if already exists
if(length(data_dict[['Categories']][['labels']]) > 0){
new_name <-
setdiff(
make.unique(c('labels',names(data_dict[['Categories']])))[-1],
names(data_dict[['Categories']]))
warning(
"The data dictionary contains 'labels' column, which usage is protected in R.
new name: ",new_name)
names(data_dict[['Categories']]) <-
make.unique(c('labels',names(data_dict[['Categories']])))[-1]
}
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
rename_with(.cols = starts_with("label")[1], ~ 'labels')
# protection of na_values if already exists
if(length(data_dict[['Categories']][['na_values']]) > 0){
new_name <-
setdiff(
make.unique(c('na_values',names(data_dict[['Categories']])))[-1],
names(data_dict[['Categories']]))
warning(
"The data dictionary contains 'na_values' column, which usage is protected in R.
New name: ",new_name)
names(data_dict[['Categories']]) <-
make.unique(c('na_values',names(data_dict[['Categories']])))[-1]
}
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
rename_with(.cols = "missing", ~ 'na_values') %>%
mutate(
na_values =
ifelse(.data$`na_values` == TRUE,.data$`labels`, NA_character_))
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
select(
'variable',
'name',
'labels',
'na_values',
everything())
}
data_dict[['Variables']] <-
data_dict[['Variables']] %>%
select('name','typeof',everything())
}
# reorder things
data_dict[['Variables']] <-
suppressMessages({left_join(
data_dict[['Variables']] %>%
select(
'name',
matches(c("^label$","^label:[[:alnum:]]")),
matches('^valueType$')),
data_dict[['Variables']][vapply(
X = data_dict[['Variables']],
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))] %>%
bind_rows(tibble(name = as.character())))})
if(sum(nrow(data_dict[['Categories']])) > 0){
data_dict[['Categories']] <-
data_dict[['Categories']] %>%
left_join(data_dict[['Variables']] %>%
select(variable = 'name') %>%
add_index('madshapR::index'),
by = join_by('variable')) %>%
arrange(.data$`madshapR::index`) %>%
select(-'madshapR::index')
data_dict[['Categories']] <-
suppressMessages({left_join(
data_dict[['Categories']] %>%
select(
'variable','name',
matches(c("^labels$","^label$","^label:[[:alnum:]]"))),
data_dict[['Categories']][vapply(
X = data_dict[['Categories']],
FUN = function(x) !all(is.na(x)),
FUN.VALUE = logical(1))] %>%
bind_rows(tibble(variable = as.character())))})
}
if(as_data_dict == TRUE) {
attributes(data_dict)$`madshapR::class` <- "data_dict"
}else{
attributes(data_dict)$`madshapR::class` <- "data_dict_mlstr"}
return(data_dict)
}
#' @title
#' Test if an object is a workable data dictionary structure
#'
#' @description
#' Tests if the input object has adequate structure to work with functions
#' involving data dictionary shaping. This function mainly helps validate input
#' within other functions of the package but could be used to check if an
#' object is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential data dictionary structure to be evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' is_data_dict_shape(data_dict)
#' is_data_dict_shape(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_data_dict_shape <- function(object){
# if only the data dictionary shape is given in parameter
test <- silently_run(try(as_data_dict_shape(object),silent = TRUE))
if(class(test)[1] == 'try-error') return(FALSE)
return(TRUE)
}
#' @title
#' Test if an object is a valid data dictionary
#'
#' @description
#' Tests if the input object is a valid data dictionary. This function mainly
#' helps validate input within other functions of the package but could be used
#' to check if an object is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential data dictionary to be evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' is_data_dict(data_dict)
#' is_data_dict(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_data_dict <- function(object){
object <- object
# if only the data frame is given in parameter
test <- silently_run(try(as_data_dict(object),silent = TRUE))
if(class(test)[1] == 'try-error') return(FALSE)
return(TRUE)
}
#' @title
#' Test if an object is a valid Maelstrom data dictionary
#'
#' @description
#' Tests if the input object is a valid data dictionary compliant with formats
#' used in Maelstrom Research ecosystem, including Opal. This function mainly
#' helps validate input within other functions of the package but could be used
#' to check if an object is valid for use in a function.
#'
#' @details
#' A data dictionary contains the list of variables in a dataset and metadata
#' about the variables and can be associated with a dataset. A data dictionary
#' object is a list of data frame(s) named 'Variables' (required) and
#' 'Categories' (if any). To be usable in any function, the data frame
#' 'Variables' must contain at least the `name` column, with all unique and
#' non-missing entries, and the data frame 'Categories' must contain at least
#' the `variable` and `name` columns, with unique combination of
#' `variable` and `name`.
#'
#' @seealso
#' For a better assessment, please use [data_dict_evaluate()].
#'
#' @param object A potential Maelstrom formatted data dictionary to be
#' evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#'
#' data_dict <- madshapR_DEMO$data_dict_MELBOURNE
#' is_data_dict_mlstr(data_dict)
#' is_data_dict_mlstr(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_data_dict_mlstr <- function(object){
object <- object
# if only the data frame is given in parameter
test <- silently_run(try(as_data_dict_mlstr(object),silent = TRUE))
if(class(test)[1] == 'try-error') return(FALSE)
return(TRUE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.