R/02-harmo_process_harmonization.R

Defines functions harmo_process

Documented in harmo_process

#' @title
#' Generate harmonized dataset(s) and associated metadata
#'
#' @description
#' Reads a DataSchema and Data Processing Elements to generate a 
#' harmonized dossier from input dataset(s) in a dossier and associated 
#' metadata. The function 
#' has one argument that can optionally be declared by the user 
#' (`unique_col_dataset`). It refers to the columns which contains name of 
#' each harmonized dataset. These two columns are added to ensure that there 
#' is always a unique entity identifier when datasets are pooled.
#'
#' @details
#' A dossier is a named list containing one or more data frames, which are 
#' input datasets. The name of each data frame in the dossier will be used as 
#' the name of the associated harmonized dataset produced by [harmo_process()].
#'
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique. 
#'
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param object Data frame(s) or list of data frame(s) containing input 
#' dataset(s).
#' @param dataschema A DataSchema object.
#' @param data_proc_elem A Data Processing Elements object.
#' @param harmonized_col_dataset A character string identifying the column 
#' to use for dataset names. NULL by default.
#' @param harmonized_col_id A character string identifying the name of the 
#' column present in every dataset to use as a dataset identifier. 
#' NULL by default.
#' @param .debug Allow user to test the inputs before processing harmonization.
#' @param dossier `r lifecycle::badge("deprecated")`
#'
#' @returns
#' A list of data frame(s), containing harmonized dataset(s). The DataSchema 
#' and Data Processing Elements are preserved as attributes of the 
#' output harmonized dossier.
#'
#' @examples
#' {
#' 
#' # Use Rmonize_DEMO to run examples.
#' 
#' library(dplyr)
#' library(madshapR) # data_dict_filter
#' 
#' dataset_MELBOURNE <- Rmonize_DEMO$dataset_MELBOURNE[1]
#' dossier <- dossier_create(list(dataset_MELBOURNE))
#' 
#' dataschema <- 
#'   Rmonize_DEMO$`dataschema - final` %>%
#'   data_dict_filter('name == "adm_unique_id"')
#' 
#' data_proc_elem <- Rmonize_DEMO$`data_processing_elements - final` %>%
#'   dplyr::filter(dataschema_variable == 'adm_unique_id',
#'          input_dataset == 'dataset_MELBOURNE')
#' 
#' # perform harmonization
#' harmonized_dossier <- harmo_process(dossier,dataschema,data_proc_elem)
#' glimpse(harmonized_dossier)
#' 
#' }
#'
#' @import dplyr tidyr fabR stringr
#' @importFrom rlang .data
#' @importFrom rlang :=
#' @importFrom rlang is_error
#' @importFrom rlang is_warning
#' @importFrom crayon bold
#'
#' @export
harmo_process <- function(
    object = NULL,
    dataschema = attributes(dossier)$`Rmonize::DataSchema`,
    data_proc_elem = attributes(dossier)$`Rmonize::Data Processing Elements`,
    harmonized_col_dataset = attributes(dossier)$`Rmonize::harmonized_col_dataset`,
    harmonized_col_id = attributes(dossier)$`Rmonize::harmonized_col_id`,
    .debug = FALSE,
    dossier = object
    ){
  
  # future dev
  # si le vT n'existe pas dans le Data Processing Elements, aller le chercher 
  # dans le DataSchema
  # controle de version ?
  
  if(is.null(object)) object <- dossier
  if(is.null(dossier)) dossier <- object
  
  if(.debug == FALSE){
    
    # check arguments
    if(is.null(object) | is.null(dataschema) | is.null(data_proc_elem))
      stop(call. = FALSE,
"

`object`, `dataschema` and `data_proc_elem` are mandatory and must be provided.
If you want to allow the code to run, you can specify 
harmo_process(object, dataset, dataprocelem, .debug = TRUE) to test your
input elements before processing harmonization.")

  }
  
  add_index_dossier <- function(dossier){
    
    dossier <- as_dossier(dossier)
    dataset_names <- names(dossier)
    
    col_ids <- 
      dossier %>% lapply(function(x) col_id(x)) %>% unique
    

    
    if(length(col_ids) > 1 | is.null(col_ids[[1]])){
      for(i in names(dossier)){
        # stop()}
        
        if(!("Rmonize::index" %in% names(dossier[[i]]))){
          dossier[[i]] <- 
            dossier[[i]] %>%
            add_index(name_index = "Rmonize::index",.force = TRUE) %>%
            mutate(`Rmonize::index` = paste0(i,".",.data$`Rmonize::index`)) %>%
            as_dataset(col_id = "Rmonize::index")          
        }else{
          dossier[[i]] <- 
            dossier[[i]] %>%
            as_dataset(col_id = "Rmonize::index")          
        }
      }}
    return(dossier)
  }
    
  data_proc_elem_get <- function(dossier){
    
    bind_dossier_names <- 
      dossier %>% 
      lapply(function(x) mutate(x[1,], across(everything(),as.character))) %>%
      bind_rows() %>%
      names
    
    total_len <- 
      dossier %>% 
      lapply(function(x) nrow(x)) %>%
      unlist() %>% sum
      # bind_rows() %>%
    
    bind_dossier <- tibble(.rows = total_len)
    
    for(i in seq_along(bind_dossier_names)){
      # stop()}
      
      cols_to_bind <- 
        dossier %>%
        lapply(function(x) select(x,any_of(bind_dossier_names[[i]])))
        
      bound_cols <- silently_run(bind_rows(cols_to_bind))
      
      if(class(bound_cols)[[1]] == 'try-error'){
        bound_cols <- 
          cols_to_bind %>%
          lapply(function(x) mutate(x, across(everything(),as.character))) %>%
          bind_rows() %>%
          valueType_self_adjust()}
      
        bind_dossier <- bind_dossier %>% bind_cols(bound_cols)
    }
        
    dataschema <- data_dict_extract(bind_dossier)
      
    data_proc_elem <-
      tibble(
        index = as.integer(),
        dataschema_variable = as.character(),
        input_dataset = as.character(),
        valueType = as.character(),
        input_variables = as.character(),
        `Mlstr_harmo::rule_category` = as.character(),
        `Mlstr_harmo::algorithm` = as.character())
    
    for(i in names(dossier)){
      # stop()}
      data_proc_elem_i <- 
        dataschema$Variables %>%
        rename("dataschema_variable" = "name") %>%
        mutate(
          input_dataset = i,
          input_variables = case_when(
            .data$`dataschema_variable` %in% names(dossier[[i]]) ~ .data$`dataschema_variable`,
            TRUE ~ "__BLANK__",
          ),
          `Mlstr_harmo::rule_category` = case_when(
            col_id(dossier[[i]]) == .data$dataschema_variable ~ "id_creation",
            .data$`dataschema_variable` %in% names(dossier[[i]]) ~ "direct_mapping",
            TRUE  ~ "impossible"
          ),
          `Mlstr_harmo::algorithm` = .data$`Mlstr_harmo::rule_category`) %>%
        add_index(.force = TRUE)  %>%
        select(-'label')
      
      data_proc_elem <- bind_rows(data_proc_elem,data_proc_elem_i)
      
    }
      
    data_proc_elem <- as_data_proc_elem(data_proc_elem)
    return(data_proc_elem)
  }

  extract_var <- function(x){
    x <- x %>%
      str_replace_all('"',"`") %>%
      str_replace_all("'","`") %>%
      str_remove_all("`") %>%
      str_squish()
    x = x[!is.na(x)]
    
    return(x)}
  
  # check arguments
  
  if(is_data_proc_elem(object)) 
    stop(call. = FALSE,'
This object is a Data Processing Elements. 
Please write harmo_process(data_proc_elem = my_object) instead.')
  
  if(is_dataschema(object)) 
    stop(call. = FALSE,'
This object is a DataSchema. 
Please write harmo_process(dataschema = my_object) instead.')
  
  if(is.null(object) & is.null(data_proc_elem) & is.null(dataschema))
    stop(call. = FALSE,"At least one element is missing.")
  
  if(!is.null(data_proc_elem))
    if(is.null(data_proc_elem[['input_dataset']]) | 
       is.null(data_proc_elem[['Mlstr_harmo::rule_category']]))
      if(is.null(data_proc_elem[['rule_category']]))
        as_data_proc_elem(data_proc_elem)
  
  # create dossier from data proc elem.
  
  if(is.null(object) & !is.null(data_proc_elem)){
    
    data_proc_elem <- as_data_proc_elem(data_proc_elem)
    name_datasets <- unique(data_proc_elem$input_dataset)
    object <- lapply(as.list(name_datasets),function(x) tibble())
    names(object) <- name_datasets
    
    for(i in name_datasets){
      # stop()}
      
      input_vars <- 
        unique(extract_var(
          data_proc_elem %>% 
            dplyr::filter(.data$`input_dataset` == i & 
                            .data$`input_variables` != '__BLANK__') %>%
            select("input_variables") %>%
            separate_longer_delim(cols = "input_variables",";") %>%
            pull("input_variables")))
      
      object[[i]] <- 
        as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars)))) %>%
        mutate(across(everything(),as.character))
      
      names(object[[i]]) <- input_vars
    }
    
    # harmonized_col_id <- extract_var(
    #   data_proc_elem %>%
    #     dplyr::filter(.data$`Mlstr_harmo::rule_category` == "id_creation") %>%
    #     pull("dataschema_variable") %>% unique)
    
    return(
      harmo_process(
        object = object,
        dataschema = dataschema,
        data_proc_elem = data_proc_elem,
        harmonized_col_dataset = harmonized_col_dataset,
        harmonized_col_id = harmonized_col_id,
        dossier = dossier,
        .debug = .debug))

  }
  
  if(is.null(object) & is.null(data_proc_elem) & !is.null(dataschema)){
    
    input_vars <- dataschema$Variables$name
    object <- as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars))))
    names(object) <- input_vars
    
    object <- valueType_adjust(from = dataschema,to = object)
    
    return(
      harmo_process(
        object = object,
        dataschema = dataschema,
        data_proc_elem = data_proc_elem,
        harmonized_col_dataset = harmonized_col_dataset,
        harmonized_col_id = harmonized_col_id,
        dossier = dossier,
        .debug = .debug))
  }
    
  if(is_dataset(dossier)) dossier <- list(dossier)
  
  if(is.null(names(dossier)))
    if(length(dossier) == 1 & !is.null(data_proc_elem))
      if(length(unique(data_proc_elem$input_dataset)) == 1 &
         !is.na(unique(data_proc_elem$input_dataset)))
        names(dossier) <- unique(data_proc_elem$input_dataset)
  
  if(is.null(names(dossier))){
    fargs <- as.list(match.call(expand.dots = TRUE))
    
    if(!is.null(fargs$object)){
      names(dossier) <- make_name_list(as.character(fargs['object']), dossier)
    }else{
      names(dossier) <- make_name_list(as.character(fargs['dossier']), dossier)
    }    
  }

  
  # check arguments. make sure col_id is harmonizable if exists
  dossier <- dossier_create(dossier)
  
  if(!is.null(data_proc_elem))
    if(length(dossier) == 1)
      if(length(unique(data_proc_elem$input_dataset)) == 1 |
         is.na(unique(data_proc_elem$input_dataset)))
        data_proc_elem$input_dataset <- names(dossier)
  
  if(!is.null(harmonized_col_id)){
    dossier <- 
      dossier %>% 
      lapply(function(x) 
        x %>% 
          as_dataset(harmonized_col_id)
    )}

  # check arguments
  # if(is.null(dataschema) & is.null(data_proc_elem) & !is.null(harmonized_col_id)){
  #   dossier <- 
  #     as_harmonized_dossier(
  #       dossier,
  #       harmonized_col_id = harmonized_col_id,
  #       harmonized_col_dataset = harmonized_col_dataset)  
  # return(harmo_process(dossier))}
  
  
  if(!is.null(data_proc_elem))
    if(length(dossier) == 1)
      if(length(unique(data_proc_elem$input_dataset)) == 1 &
         length(str_subset(data_proc_elem$`Mlstr_harmo::rule_category`,'id_creation')) == 0){
        
        if(is.null(harmonized_col_id)) 
          dossier <- add_index_dossier(dossier)
        
        id_creation <- data_proc_elem_get(dossier %>% lapply(function(x) x[1]))
        data_proc_elem <- bind_rows(id_creation,data_proc_elem)
         }

  # check arguments
  if(is.null(data_proc_elem)){
    dossier <- add_index_dossier(dossier)
    data_proc_elem <- data_proc_elem_get(dossier)
  }else{
    data_proc_elem <- as_data_proc_elem(data_proc_elem)
  }

  if(is.null(dataschema)){
    dataschema <- dataschema_extract(data_proc_elem)
  }else{
    dataschema <- as_dataschema_mlstr(dataschema)}
  
  # clean dataschema_variable and input dataset names
  data_proc_elem$dataschema_variable <- 
    extract_var(data_proc_elem$dataschema_variable)
  
  data_proc_elem$input_dataset <- 
    extract_var(data_proc_elem$input_dataset)
  
  # extraction of Data Processing Elements
  dpe <-
    data_proc_elem %>%
    rename(
      output_variable = "dataschema_variable",
      script          = "Mlstr_harmo::algorithm",
      rule_category   = "Mlstr_harmo::rule_category") %>%
    mutate(
      `script` = replace_na(.data$`script`, "undetermined"),
      `rule_category` = replace_na(.data$`rule_category`, "undetermined"),
      `rule_category` = ifelse(
        .data$`script` == "undetermined","undetermined",
        .data$`rule_category`),
      `script` = ifelse(
        .data$`rule_category` == "undetermined","undetermined",
        .data$`script`),
      `input_variables` = ifelse(
        .data$`rule_category` == "undetermined","__BLANK__",
        .data$`input_variables`))
  
  if(is.null(harmonized_col_id)){
    harmonized_col_id <- 
      unique(dpe[
        dpe$`rule_category` %in% 'id_creation',][[
          'output_variable']])}
  
  # test if harmonized_col_id exists in dpe and dataschema
  if(! harmonized_col_id %in% dataschema$Variables$name){
    
    dataschema_col_id <- data_dict_extract(dossier[[1]][1])
    dataschema$Variables <-
      bind_rows(dataschema_col_id$Variables,dataschema$Variables)
    
#     stop(call. = FALSE,
# '\n\nThe harmonized_col_id `',harmonized_col_id,'`',
# '\nmust be present in your DataSchema and in the Data Processing Elements.')
    
  }

  
  # test if harmonized_col_id exists in dpe and dpe
  if(! harmonized_col_id %in% data_proc_elem$dataschema_variable)
    stop(message('ERROR 100'))
#     stop(call. = FALSE,
# '\n\nThe harmonized_col_id `',harmonized_col_id,'`',
# '\nmust be present in your DataSchema and in the Data Processing Elements.')

  dpe <- dpe %>%
    group_by(.data$`input_dataset`) %>%
    group_split() %>%
    as.list
  names(dpe) <- sort(unique(bind_rows(dpe)$input_dataset))

  vars <- extract_var(unique(data_proc_elem$dataschema_variable))
  dataschema <- 
    data_dict_filter(
      dataschema,filter_var = 
        paste0(c("name %in% c('",paste0(vars,collapse = "','"),"')"),
               collapse = "")) %>%
    as_dataschema(as_dataschema_mlstr = TRUE)
  
  if(!is.null(harmonized_col_dataset)){
    
    # test if harmonized_col_dataset exists
    if(! harmonized_col_dataset %in% dataschema$Variable$name)
      stop(call. = FALSE,
'\n\nIf declared, the harmonized_col_dataset `',harmonized_col_dataset,'`',
'\nmust be present in your DataSchema and in the Data Processing Elements.')
    
  }
  
  if(length(intersect(names(dossier), names(dpe))) == 0){
    stop(call. = FALSE, 
'The dataset list to be harmonized is empty.
    
This usually means that your dataset names in the Data Processing Elements 
(in the column `dataset_input`) do not match the names in your dossier list. 
Please correct elements and reprocess.')
    
  }
  
  # intersection of dossier and dpe
  dossier <- dossier[intersect(names(dossier), names(dpe))]
  
  # selection of needed columns
  for(i in names(dossier)){
    # stop()}
  
    create_id_row <- 
      dpe[[i]][dpe[[i]]$`output_variable` %in% harmonized_col_id,]
    var_id <- extract_var(create_id_row[['input_variables']])
    
    names_in_dpe <- 
      str_squish(unlist(strsplit(
        dpe[[i]] %>% 
          dplyr::filter(
            .data$`input_variables` != '__BLANK__') %>%
          pull(.data$`input_variables`),split = ";"))) %>%
      unique %>% 
      extract_var
    
    dossier[[i]] <- try(as_dataset(dossier[[i]],col_id = var_id),silent = TRUE)
    
    if(class(dossier[[i]])[1] == 'try-error'){
      
      stop(message('ERROR 100'))
# stop(call. = FALSE, 
# 'In your Data Processing Elements, the input variable `',var_id,'` does not 
# exists in the input dataset `',create_id_row$`input_dataset`,'`.
# 
# This element is mandatory for the data processing to be initiated. 
# Please correct elements and reprocess.')
      
      
    }
    
    dossier[[i]] <- 
      dossier[[i]] %>% 
      select(all_of(col_id(dossier[[i]])),any_of(!! names_in_dpe))
  }
  
  # rid of data dictionary
  dossier <- 
    as_dossier(dossier) %>% 
    lapply(dataset_zap_data_dict)
  
  dpe <- dpe_init <- dpe[intersect(names(dossier), names(dpe))]
  
  # gather all information by dataset to be harmonized.
  harmonized_dossier <- harmonized_dossier_init <- 
    dpe %>% lapply(function(x){
      tbl <- tibble(!!! dataschema[['Variables']]$name, .rows = 0)
      names(tbl) <- dataschema[['Variables']]$name
      tbl <- tbl %>%
        select(all_of(harmonized_col_id), everything())
      return(tbl)})
  
  # creation of id
  for(i in names(harmonized_dossier)){
    # stop()}
    
    create_id_row <- 
      dpe[[i]][dpe[[i]]$`output_variable` %in% harmonized_col_id,]
    var_id <- extract_var(create_id_row[['input_variables']])
    
    dossier[[i]][var_id] <- 
      dossier[[i]][var_id] %>% 
      mutate(across(all_of(var_id),as.character)) %>%
      mutate(across(all_of(var_id) , ~ paste0('{',row_number(.),'}::',.)))
    
    harmonization_id <-
      dossier[[i]][var_id] %>%
      rename_with(
        .cols = all_of(var_id), ~ create_id_row$output_variable)
    
    harmonized_dossier[[i]] <-
      harmonized_dossier[[i]] %>% bind_rows(harmonization_id)
    
    harmonized_dossier[[i]] <- harmonized_dossier_init[[i]] <-
      as_dataset(
        harmonized_dossier[[i]],
        col_id = create_id_row$output_variable)
  }
  
  message(bold(
"- Data Processing Elements: ------------------------------------------------"))
  
  # recast <- tibble()
  harmonization_report <- harmonization_report_init <- 
    tibble(
      output_variable = as.character(),
      input_dataset = as.character(),
      input_variables = as.character(),
      rule_category = as.character(),
      script = as.character(),
      `Rmonize::r_script` = as.character())
  
  harmonized_dossier <- harmonized_dossier_init
  dpe <- dpe_init
  harmonization_report <- harmonization_report_init
  
  for (i in names(harmonized_dossier)) {
    # stop()}
    
    message(str_sub(paste0("\n",
"--harmonization of : ",
bold(i)," -----------------------------------------------------"),1,81))
    
    create_id_row <- 
      dpe[[i]][dpe[[i]]$`output_variable` %in% harmonized_col_id,]
    dataset_id <- extract_var(create_id_row[['input_variables']])
    input_dataset <- dossier[[i]]
    
    message(str_sub(paste0(
      str_trunc(paste0(
        "    processing ","1","/",nrow(dpe[[i]])," : ",
        harmonized_col_id),width = 49,ellipsis = '[...]'),
      "                                       "),1,50),
      bold("id created"))
    
    harmonization_report <-
      harmonization_report %>%
      bind_rows(
        create_id_row %>%
          mutate(
            `Rmonize::r_script` = paste0(
              "`",extract_var(.data$input_dataset),"`"," %>% \n",
              "  select('",!! harmonized_col_id,"' = '",
              dataset_id,"')")))
    
    # exclusion of id_creation rule for the rest of the process
    
    idx <- 1
    
    for(j in dpe[[i]][['output_variable']][!dpe[[i]]$`output_variable` %in%
                                           harmonized_col_id]){
      # stop()}
      
      process_rule  <- dpe[[i]][dpe[[i]]$output_variable == j,]
      idx <- idx + 1
      
      harmo_parse_process_rule <- 
      
      r_script <-
        get("harmo_parse_process_rule", envir = asNamespace("Rmonize"))(
          process_rule_slice = process_rule,
          input_dataset = input_dataset,
          r_script = TRUE)
      
      col <-
        get("harmo_parse_process_rule", envir = asNamespace("Rmonize"))(
          process_rule_slice = process_rule, 
          input_dataset = input_dataset, 
          r_script = FALSE)
      
      if(!is_error(col)){
        vT_test <- fabR::silently_run(
          as_valueType(unique(col[[j]]),
                       dataschema[['Variables']] %>%
                         dplyr::filter(.data$`name` == j) %>%
                         pull(.data$`valueType`)))
        
        if(is_error(attributes(vT_test)$condition))
          col <- attributes(vT_test)$condition}

      error_status <- NULL
      warning_status <- NULL
      
      if(is_error(col)){
        
        message(str_sub(paste0(
          str_trunc(paste0(
            "    processing ",idx,"/",nrow(dpe[[i]])," : ",
            process_rule$`output_variable`),width = 49,ellipsis = '[...]'),
          "                                       "),1,50),
          bold("**ERROR**"))
        
        error_status <- conditionMessage(col)
        
      }else{
        
        col <-
          col %>%
          rename_with(.cols = !! dataset_id,.fn = ~ harmonized_col_id) %>%
          mutate(across(!!harmonized_col_id, as.character))
        
        harmonized_dossier[[i]] <-
          harmonized_dossier[[i]] %>%
          select(- !! j) %>%
          full_join(col,by = harmonized_col_id)
        
        
        if(is_warning(attributes(col)$`Rmonize::warning`)){
          
          message(str_sub(paste0(
            str_trunc(paste0(
              "    processing ",idx,"/",nrow(dpe[[i]])," : ",
              process_rule$`output_variable`),width = 49,ellipsis = '[...]'),
            "                                       "),1,50),
            bold("**Warning(s)**"))
          
          warning_status <- conditionMessage(attributes(col)$`Rmonize::warning`)
          
        }else{
          
          status <- process_rule$`Mlstr_harmo::status`
          
          message(str_sub(paste0(
            str_trunc(paste0(
              "    processing ",idx,"/",nrow(dpe[[i]])," : ",
              process_rule$`output_variable`),width = 49,ellipsis = '[...]'),
            "                                       "),1,50),
            ifelse(status %in% 
                     c("undetermined",NA),bold(status),status)) 
          
        }
      }
      
      harmonization_report <-
        harmonization_report %>%
        bind_rows(
          process_rule %>%
            mutate(
              `Rmonize::error_status`   = error_status,
              `Rmonize::warning_status` = warning_status,
              `Rmonize::r_script` = 
                ifelse(.data$`rule_category` %in% c("undetermined",NA),
                       .data$`rule_category`, r_script),
              
              `Rmonize::r_script` = 
                str_replace_all(
                  .data$`Rmonize::r_script`, "input_dataset",.data$`input_dataset`),
              
              `Rmonize::r_script` = 
                str_replace(
                  .data$`Rmonize::r_script`, 
                  paste0("select\\('",var_id,"','",j,"'\\)$"),
                  paste0("select('",harmonized_col_id,"' = '",
                         var_id,"','",j,"')"))))
    }
  }
  
  
  # creation of id
  for(i in names(harmonized_dossier)){
    # stop()}
  
    id_row <- 
      dpe[[i]][dpe[[i]]$`output_variable` %in% harmonized_col_id,]
    
    dossier[[i]][id_row$input_variables] <- 
      dossier[[i]][id_row$input_variables] %>% 
      mutate(
        across(all_of(id_row$input_variables) , ~ 
        str_remove_all(.,pattern = '^[^\\}\\:\\:]*\\}\\:\\:')))
    
    harmonized_dossier[[i]][id_row$output_variable] <-
      harmonized_dossier[[i]][id_row$output_variable] %>% 
      mutate(
        across(all_of(id_row$output_variable) , ~ 
        str_remove_all(.,pattern = '^[^\\}\\:\\:]*\\}\\:\\:')))
  }
  
  data_proc_elem <- 
    harmonization_report %>%
    rename(
      "dataschema_variable"        = "output_variable" ,
      "Mlstr_harmo::algorithm"     = "script"          ,
      "Mlstr_harmo::rule_category" = "rule_category"   )
  
  harmonized_dossier <- 
    as_harmonized_dossier(
      harmonized_dossier,dataschema,data_proc_elem,
      harmonized_col_id,harmonized_col_dataset,
      harmonized_data_dict_apply = TRUE)
  
  nb_error <- if(!is.null(harmonization_report[['Rmonize::error_status']])){
    sum(!is.na(harmonization_report[['Rmonize::error_status']]))
  }else{0}
  
  if(nb_error > 0){
    message(
      "\n
------------------------------------------------------------------------------\n
Your harmonization process contains errors. The tables with error(s) have not 
been generated to avoid any version confusion. When harmonization elements and 
rules are corrected, rerun the process with the updated Data Processing Elements 
to generate complete harmonized tables.\n",
      
      bold("\n\nUseful tip: "),
"If you identified errors and want to correct them later, you can specify
'undetermined' in the column 'Mlstr_harmo::algorithm' of your problematic 
Data Processing Elements(s). Such Data Processing Elements will be ignored.\n")
    
    for(i in names(harmonized_dossier)){
      # stop()}
      
      nb_error_i <- nrow(dplyr::filter(
        harmonization_report[harmonization_report$input_dataset %in% i,],
        !is.na(.data$`Rmonize::error_status`)))
      
      if(nb_error_i > 0) harmonized_dossier[[i]] <- 
          harmonized_dossier[[i]] %>% slice(0)
    }
    
  }
  
  nb_undet <-
    nrow(dplyr::filter(harmonization_report,
                       is.na(.data$`Mlstr_harmo::status`) |
                       str_detect(.data$`Mlstr_harmo::status`, "undetermined")))
  
  if(nb_undet > 0){
    message(
      "\n
------------------------------------------------------------------------------\n
The harmonization process contains 'undetermined' statuses or harmonization 
rules. These variables will appear as empty columns in any harmonized tables 
generated. When harmonization statuses and rules are determined, rerun the 
process with the updated Data Processing Elements to generate complete 
harmonized tables.")}
  
  
  if(nb_error > 0){
    message(
      "\n
------------------------------------------------------------------------------\n
Your harmonization is done. Please check if everything worked correctly.\n")    
    
  }
  
  message(bold(
    "
- WARNING MESSAGES (if any): ----------------------------------------------\n"))
  
  return(harmonized_dossier)
  
}

#' @title
#' Internal function that generates code for harmonized dataset
#'
#' @description
#' Generates code for harmonized dataset.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#' @param input_dataset A data frame identifying the input table to be 
#' harmonized.
#' @param r_script R Script extracted form Data Processing Elements.
#'
#' @returns
#' A data frame identifying a harmonized dataset
#'
#' @import dplyr
#' @importFrom rlang .data
#' @noRd
harmo_parse_process_rule <- function(
    process_rule_slice, 
    input_dataset,
    r_script){

  if(is.na(process_rule_slice$`rule_category`))
    process_rule_slice$`rule_category` <- 'undetermined'
  
  process_rule_slice <- 
    process_rule_slice %>%
    mutate(
      to_eval_test =
        # paste0("try(harmo_process_",
        paste0("try(Rmonize:::harmo_process_",
               process_rule_slice$`rule_category`,
               "(process_rule_slice %>% slice(",row_number(),
               ")), silent = TRUE)" ))

  process_script <-
    paste0("`input_dataset` %>% \n",
           eval(parse(text = process_rule_slice$to_eval_test)), " %>%\n",
           "  select(",toString(paste0("'",col_id(input_dataset),"'")),",'",
                       process_rule_slice$output_variable,"')")

  if(r_script) return(process_script)

  process_script <- 

    tryCatch(
      
      {
        
        eval(parse(text = process_script))
        
      },
      error = function(e) {
        # Choose a return value in case of error
        return(e)
      },
      warning = function(w) {
        
        process_script <- eval(parse(text = process_script))
        attributes(process_script)$`Rmonize::warning` <- w
        
        # Choose a return value in case of warning
        return(process_script)
      }
    )
  
  return(process_script)
}

#' @title
#' Internal function for 'add_variable' in the process of harmonization
#'
#' @description
#' Processes 'add_variable' in the process of harmonization.
#' 
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#' @noRd
harmo_process_add_variable <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = "left",
      to_eval_test     = paste0(
        .data$`output_table`," %>% left_join(\n",
        "  ",.data$`input_dataset`," %>% \n",
        "  mutate(\n",
        "  '",.data$`output_variable`,"' = ",.data$`replacement`,") %>% \n")
      ) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'case_when' in the process of harmonization
#'
#' @description
#' Processes 'case_when' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_case_when <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    # Rmonize_DEMO$`data_processing_elements - final` %>%
    # dplyr::filter(`Mlstr_harmo::rule_category` == 'case_when') %>%
    # slice(1) %>%
    # select(
    #   output_variable = dataschema_variable,
    #   script = `Mlstr_harmo::algorithm`) %>%
    # mutate(script ='case_when( "j\'ai aucune idée, mais _$ca va" ~ NA ; ELSE ~ $var)') %>%
    mutate(
      replacement      =
        .data$`script` %>% str_squish(),
      replacement      =
        str_replace_all(.data$`replacement`,"case_when\\(","case_when(\n     "),
      replacement      =
        str_replace_all(.data$`replacement`,";",",\n     "),
      replacement      =
        str_replace_all(.data$`replacement`,"ELSE","TRUE"),
      # replacement      =
      #   str_replace_all(.data$`replacement`,"\\$",
      #                   paste0(.data$`input_dataset`,"$")),
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'direct_mapping' in the process of harmonization
#'
#' @description
#' Processes 'direct_mapping' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_direct_mapping <- function(process_rule_slice){
  
  process_script_to_eval <-
    process_rule_slice %>%
    # process_rule %>%
    mutate(
      replacement      = str_remove_all(.data$`input_variables`,'`'),
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = `",.data$`replacement`,"`)")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'id_creation' in the process of harmonization
#'
#' @description
#' Creates id in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#' 
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_id_creation <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = .data$`input_variables`,
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'impossible' in the process of harmonization
#'
#' @description
#' Processes 'impossible' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_impossible <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = NA_character_,
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'merge' in the process of harmonization
#'
#' @description
#' Processes 'merge' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements.  
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_merge_variable <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      var_to_add   =
        harmo_process_add_variable(
          process_rule_slice %>% mutate(script = "left")),
      var_to_merge = harmo_process_case_when(process_rule_slice),
      var_to_merge =
        .data$`var_to_merge` %>% gsub('^.+?%>%(.*)', "\\1"),
      replacement  =
        paste0(.data$`var_to_add`," %>%",.data$`var_to_merge`),
      replacement  =
        str_replace_all(.data$`replacement`,"\\.old",".x"),
      replacement  =
        str_replace_all(.data$`replacement`,"\\.new",".y"),
      to_eval_test =
        paste0(
          .data$`output_table`," %>% left_join(\n",
          "  ",.data$`input_dataset`," %>% \n",
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,") %>% \n"
          )) %>%
    pull(.data$`to_eval_test`)
  
  return(process_script_to_eval$to_eval_test)
}

#' @title
#' Internal function for 'operation' in the process of harmonization
#'
#' @description
#' Processes 'operation' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_operation <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      =
        .data$`script` %>% str_squish(),
      replacement      =
        .data$`replacement` %>%
        str_replace_all("\\$",paste0(.data$`input_dataset`,"$")),
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'other' in the process of harmonization
#'
#' @description
#' Processes 'other' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_other <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = .data$`script` %>% str_squish(),
      to_eval_test =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'paste' in the process of harmonization
#'
#' @description
#' Processes 'paste' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 

#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_paste <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = .data$`script`,
      replacement      = str_squish(.data$`replacement`),
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,"
          )")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'recode' in the process of harmonization
#'
#' @description
#' Processes 'recode' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr 
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_recode <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    # process_rule %>%
    mutate(
      input_variables  = str_remove_all(.data$`input_variables`,'`'),
      replacement      =
        .data$`script` %>% str_squish(),
      replacement      =
        str_replace_all(.data$`replacement`,"\'","{madshapR::apostrophy}"),
      replacement      =
        str_replace_all(.data$`replacement`,",","{madshapR::coma}"),
      replacement      =
        gsub("^recode\\("," , '",.data$`replacement`),
      replacement      =
        gsub("ELSE","else ",.data$`replacement`),
      replacement      =
        gsub(")$","')",.data$`replacement`),
      replacement      =
        paste0("car::recode(\n      var = .$`",
               .data$`input_variables`,"`",.data$`replacement`),
      replacement      =
        str_replace_all(.data$`replacement`,"fun::",""),
      replacement      =
        str_replace_all(.data$`replacement`,",",",\n      recodes = "),
      
    replacement      =
      str_replace_all(.data$`replacement`,
                "car::recode\\(","car::recode(\n      as.numeric = FALSE ,"),
    replacement      =
      str_replace_all(.data$`replacement`,"\\{madshapR::apostrophy\\}","\\\\'"),
    replacement      =
      str_replace_all(.data$`replacement`,"\\{madshapR::coma\\}",","),
      
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`input_variables`,
          "' = stringr::str_squish(`",.data$`input_variables`,"`), \n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'rename' in the process of harmonization
#'
#' @description
#' Processes 'rename' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 

#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#' 
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_rename <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = .data$`script`,
      to_eval_test     =
        paste0(
          .data$`output_table`," %>% \n",
          "  rename('",.data$`output_variable`,"' = '",
          .data$`input_variables`,"')")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Internal function for 'undetermined' in the process of harmonization
#'
#' @description
#' Processes 'undetermined' in the process of harmonization.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param process_rule_slice A one-rowed data frame, identifying the occurring
#' Data Processing Elements. 
#'
#' @returns
#' A character string of R code for harmonizing a specific column in
#' an input dataset. That line code is the transcription of the
#' harmonisation rule in the Data Processing Elements.
#'
#' @import dplyr stringr
#' @importFrom rlang .data
#'
#' @noRd
harmo_process_undetermined <- function(process_rule_slice){

  process_script_to_eval <-
    process_rule_slice %>%
    mutate(
      replacement      = NA_character_,
      to_eval_test     =
        paste0(
          "  mutate(\n",
          "  '",.data$`output_variable`,"' = ",.data$`replacement`,")")) %>%
    pull(.data$`to_eval_test`)

  return(process_script_to_eval)
}

#' @title
#' Print a summary of data processing in the console
#'
#' @description
#' Reads a harmonized dossier, product of [harmo_process()], to list processes, 
#' any errors, and an overview of each harmonization rule. The output printed 
#' in the console can help in correcting any errors that occurred during 
#' data processing.
#'
#' @details
#' A harmonized dossier is a named list containing one or more data frames, 
#' which are harmonized datasets. A harmonized dossier is generally the 
#' product of applying processing to a dossier object The name of each 
#' harmonized dataset (data frame) is taken from the reference input dataset. 
#' A harmonized dossier also contains the DataSchema and 
#' Data Processing Elements used in processing as attributes.
#'
#' @param harmonized_dossier A list containing the harmonized dataset(s).
#' @param show_warnings Whether the function should print warnings or not. 
#' TRUE by default.
#'
#' @returns
#' Nothing to be returned. The function prints messages in the console, 
#' showing any errors in the processing.
#'
#' @examples
#' {
#'
#'   harmonized_dossier <- Rmonize_DEMO$harmonized_dossier
#'   show_harmo_error(harmonized_dossier)
#' 
#' }
#'
#' @import dplyr tidyr stringr
#' @importFrom rlang .data
#' @importFrom crayon bold
#' @importFrom crayon green
#' @importFrom utils capture.output
#'
#' @export
show_harmo_error <- function(harmonized_dossier, show_warnings = TRUE){

  # list of primary error in the Data Processing Elements.
  # print the list of error + the index

  # check input
  as_harmonized_dossier(harmonized_dossier)
  
  data_proc_elem <- 
    as_data_proc_elem(
      attributes(harmonized_dossier)[['Rmonize::Data Processing Elements']])

  for(i in names(harmonized_dossier)){
    # stop()}

    if(nrow(harmonized_dossier[[i]]) > 0){
      tbl <- 
        harmonized_dossier[[i]] %>% 
        summarise(across(everything(), ~all(is.na(.)))) %>%
        pivot_longer(everything()) %>%
        dplyr::filter(.data$`value` == TRUE) %>%
        select('dataschema_variable' = 'name','empty' = 'value')
      dpe <- 
        data_proc_elem %>% 
        dplyr::filter(.data$`input_dataset` == i & 
             !(.data$`Mlstr_harmo::rule_category` %in% 
                   c('impossible','undetermined'))) %>%
        select('dataschema_variable') %>% distinct
      
      empty_col <- inner_join(tbl,dpe, by = 'dataschema_variable') 
      status <- 'Warning: The variable generated is NA'
      
      data_proc_elem <- 
        data_proc_elem %>%
        mutate(
          `Mlstr_harmo::status` =
            ifelse(
              .data$`dataschema_variable` %in% empty_col$dataschema_variable &
              .data$`input_dataset` == i,
              status, .data$`Mlstr_harmo::status`))
    }
    
  } 
  
  report_log <-
    data_proc_elem %>%
    select(
      "dataschema_variable", "input_dataset",
      "Mlstr_harmo::algorithm",
      "Mlstr_harmo::rule_category", "Rmonize::r_script",
      matches("Rmonize::error_status"),
      matches("Rmonize::warning_status")) %>%
    bind_rows(tibble(
      "Rmonize::error_status" = as.character(),
      "Rmonize::warning_status" = as.character())) %>%
    dplyr::filter(!is.na(.data$`Rmonize::error_status`) |
                    !is.na(.data$`Rmonize::warning_status`)) %>%
    mutate(error = !is.na(.data$`Rmonize::error_status`),
           warning = !is.na(.data$`Rmonize::warning_status`)) %>%
    unite(col = "Rmonize::status",
          "Rmonize::error_status","Rmonize::warning_status",
          na.rm = TRUE) %>%
    mutate(
      var = paste0(.data$`dataschema_variable`, " in ",
                   .data$`input_dataset`, " \n"),
      rule = paste0(.data$`Mlstr_harmo::algorithm`))

  if(show_warnings == FALSE){
    report_log <-
      report_log %>%
      dplyr::filter(.data$warning != TRUE)
  }
  
  if(nrow(report_log) > 0){
    message(bold(
paste0("\n
- ERROR",ifelse(show_warnings == TRUE,"/WARNING",""),
" STATUS DETAILS: -------------------------------------------\n")),
paste0("\nHere is the list of the errors",ifelse(show_warnings == TRUE," and warnings",""),
" encountered in the process 
of harmonization:\n"))
    
    for(i in seq_len(nrow(report_log))){
      message(
"---------------------------------------------------------------------------\n")
      message(bold(report_log$var[i]))
      message(bold("\nRule Category:"))
      message(report_log$`Mlstr_harmo::rule_category`[i])
      message(bold("\nAlgorithm:"))
      message(report_log$rule[i])
      message(bold("\nR Script:"))
      message(report_log$`Rmonize::r_script`[i])
      message(green(bold("\nR Error/Warning:")))
      message(green(report_log$`Rmonize::status`[i]))
    }
  }
  
  report_log <-  
    report_log %>% 
    group_by(.data$`error`,.data$`warning`) %>%
    count(.data$`Mlstr_harmo::rule_category`) %>%
    full_join(data_proc_elem %>%
                count(.data$`Mlstr_harmo::rule_category`),
              by = "Mlstr_harmo::rule_category") %>%
    ungroup %>%
    mutate(
      n.x = replace_na(.data$`n.x`,0),
      `Total number of errors`   =ifelse(.data$`error` %in% TRUE,.data$`n.x`,0),
      `Total number of warnings` =ifelse(.data$`warning` %in% TRUE,.data$`n.x`,0), 
      success = trunc((1 - .data$`Total number of errors`/.data$`n.y`)*100)) %>%
    arrange(.data$`success`) %>%
    mutate(success = paste0(.data$`success`," %")) %>%
    rename(`Total number of algorithms` = "n.y") %>%
    # mutate_all(as.character) %>%
    select(-'error',-'warning',-'n.x')
  
  if(sum(report_log$`Total number of errors`) == 0)
    report_log$`Total number of errors` <- NULL
  
  if(sum(report_log$`Total number of warnings`) == 0)
    report_log$`Total number of warnings` <- NULL
  
  message(bold(
"\n\n- STATUS SUMMARY: ----------------------------------------------------\n"))
  message(paste(capture.output({print(
    report_log %>%
      mutate(across(everything(), as.character))
  )}), collapse = "\n"))

  return(message(""))
}

#' @title
#' Generate a DataSchema based on Data Processing Elements
#'
#' @description
#' Generates a DataSchema from a Data Processing Elements.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param data_proc_elem A Data Processing Elements object.
#' 
#' @returns
#' A list of data frame(s) named 'Variables' and (if any) 'Categories', with 
#' `Rmonize::class` 'dataschema'.
#'
#' @examples
#' {
#'
#' # Use Rmonize_DEMO to run examples.
#' library(dplyr)
#'
#' glimpse(dataschema_extract(
#'   data_proc_elem = Rmonize_DEMO$`data_processing_elements - final`))
#' }
#'
#' @import dplyr
#' @importFrom rlang .data
#'
#' @export
dataschema_extract <- function(data_proc_elem){

  data_proc_elem <- as_data_proc_elem(data_proc_elem)

  ## creation of the DataSchema in a list provided from the 
  ## dataprocessing element variables

  dataschema <- list(
    Variables =
      data_proc_elem[
        c('dataschema_variable','valueType')] %>%
      rename(name = 'dataschema_variable') %>%
      distinct) %>%
    as_dataschema(as_dataschema_mlstr = TRUE)

#   warning(call. = FALSE,
# "Your project does not include the DataSchema. An automatic DataSchema 
#  and harmonized data dictionary have been created based on your 
#  Data Processing Elements." )

  return(dataschema)
}

#' @title
#' Validate and coerce as a Data Processing Elements object
#'
#' @description
#' Checks if an object is a valid Data Processing Elements and returns it with 
#' the appropriate `Rmonize::class` attribute. This function mainly helps 
#' validate inputs within other functions of the package but could be used 
#' separately to ensure that an object has an appropriate structure.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param object A potential Data Processing Elements object to be coerced.
#'
#' @returns
#' A data frame with `Rmonize::class` 'data_proc_elem'.
#'
#' @examples
#' {
#'
#' # Use Rmonize_DEMO to run examples.
#' library(dplyr)
#'
#' glimpse(head(as_data_proc_elem(Rmonize_DEMO$`data_processing_elements - final`),3))
#' 
#' }
#'
#' @import dplyr fabR tidyr
#' @importFrom rlang .data
#'
#' @export
as_data_proc_elem <- function(object){

  object_init <- object
  
  if(is.data.frame(object)){
    
    # if('Mlstr_harmo::algorithm' %in% names(object)){
      object <- 
        object %>%
        rename_with(
          .cols = everything(),
          .fn = ~ case_when(
            . == 'ss_table'            ~ 'input_dataset',
            . == 'ss_variables'        ~ 'input_variables',
            . == 'harmo_rule'          ~ 'Mlstr_harmo::algorithm',
            . == 'harmo_status'        ~ 'Mlstr_harmo::status',
            . == 'harmo_status_detail' ~ 'Mlstr_harmo::status_detail',
            . %in% c('status'       ,
                     'status_detail',
                     'comment'      ,
                     'algorithm'    ,
                     'rule_category')  ~ paste0('Mlstr_harmo::',.),
            TRUE                       ~ .))
      
    # }
  }
  
  if(sum(names(object) %in% 
         c('dataschema_variable',
           'input_dataset',
           'input_variables',
           'Mlstr_harmo::rule_category',
           'Mlstr_harmo::algorithm'
           )) == 5 & is.data.frame(object)){
    
    # up to date object_init 
    if(sum(names(object_init) %in% 
           c('ss_table','ss_variables','harmo_rule')) > 0 ){
    
      warning(call. = FALSE,
"\n\nSome column names in your Data Processing Elements have been deprecated 
since the last version of the package. To avoid this warning, we recommend to 
change column names into the newest ones as defined by Maelstrom standards :

     Old column names                New column names
  ======================   >   =============================
  
    ss_table               >     input_dataset
    ss_variables           >     input_variables
    rule_category          >     Mlstr_harmo::rule_category
    harmo_rule             >     Mlstr_harmo::algorithm
    harmo_status           >     Mlstr_harmo::status
    harmo_status_detail    >     Mlstr_harmo::status_detail
    comment                >     Mlstr_harmo::comment
   
   
Please refer to documentation.")
    
    }
    

    # no NA in dataschema_variable
    if(sum(is.na(unique(object$dataschema_variable))) > 0)
      stop(call. = FALSE,
'The column `dataschema_variable` of your Data Processing Elements must not 
contain NA values.')
    
    # no NA in input_dataset
    if(sum(is.na(unique(object$input_dataset))) > 0)
      stop(call. = FALSE,
'The column `input_dataset` of your Data Processing Elements must not 
contain NA values.')
    
    # id_creation rule must exist
    object %>%
      group_by(.data$`input_dataset`) %>% group_split() %>% as.list() %>%
      lapply(function(x){
      col_id <- 
        x[x$`Mlstr_harmo::rule_category` %in% 'id_creation',][['dataschema_variable']]
      
      if(length(col_id) == 0)
        stop(call. = FALSE,
'In ',unique(x$input_dataset),': \n\n',
'In the column `Mlstr_harmo::rule_category` of your Data Processing Elements, 
"id_creation" rule is missing. The harmonization process cannot start because 
this step initiates the process for each of your processed datasets. 
Your first processing element must be "id_creation" in `Mlstr_harmo::rule_category` 
followed by the name of the column taken as identifier of dataset in your 
dossier.')
      
      if(length(col_id)  > 1)
        stop(call. = FALSE,
'In ',unique(x$input_dataset),': \n\n',
'In the column `Mlstr_harmo::rule_category` of your Data Processing Elements, 
"id_creation" rule is not unique. The harmonization process cannot start because 
this step initiates the process and must be unique for each of your processed 
dataset(s). Your first processing element must be "id_creation" in 
`Mlstr_harmo::rule_category` followed by the name of the column taken as identifier of 
dataset in your dossier.')
    })
    
  nb_col_id <- 
    unique(
      object[
        object$`Mlstr_harmo::rule_category` %in% 'id_creation',][[
          'dataschema_variable']])
  
  if(length(nb_col_id) > 1)
    stop(call. = FALSE,
'In ',unique(object$input_dataset),': \n\n',
'In the column `Mlstr_harmo::rule_category` of your Data Processing Elements, 
"id_creation" is not unique across the harmonized dataset(s) to generate. 
The harmonization process cannot start because this step initiates the process 
for each of your processed datasets and must be the same variable across the 
generated harmonized dataset(s). Your first processing element must be 
"id_creation" in `Mlstr_harmo::rule_category` followed by the name of the column taken 
identifier of dataset in your dossier.')
  
  # harmo status must be either NA, complete, impossible or undetermined

  # step cleaning : addition of missing columns
  object <-
    object %>%
    bind_rows(tibble(
      `valueType` = as.character(),
      `Mlstr_harmo::status` = as.character(),
      `Mlstr_harmo::status_detail` = as.character(),
      `Mlstr_harmo::comment` = as.character()))

  
  # step cleaning : addition of `Mlstr_harmo::rule_category`.
  object <-
    object %>%
    mutate(
      `Mlstr_harmo::rule_category` = case_when(
        if_any(c("input_variables",
                 "Mlstr_harmo::rule_category",
                 "Mlstr_harmo::algorithm"), ~ . == "undetermined") ~ 
          'undetermined',
        is.na(`Mlstr_harmo::rule_category`)                        ~ 
          'undetermined',
        TRUE                                                       ~ 
          `Mlstr_harmo::rule_category`))
  
  # step cleaning : addition of Mlstr_harmo::status.
  object <-
    object %>%  
    mutate(
      `Mlstr_harmo::status` = case_when(
        if_any(c("input_variables",
                 "Mlstr_harmo::rule_category",
                 "Mlstr_harmo::algorithm"), ~ . == "undetermined") ~ 
          'undetermined',
        if_any(c("Mlstr_harmo::rule_category",
                 "Mlstr_harmo::algorithm"), ~ . == "impossible")   ~ 
          'impossible',
        is.na(`Mlstr_harmo::status`)                               ~ 
          'complete',
        TRUE                                                       ~ 
          .data$`Mlstr_harmo::status`))
    
  # step cleaning : addition of Mlstr_harmo::status_detail.
  object <-
    object %>%  
    mutate(
      `Mlstr_harmo::status_detail` = case_when(
        if_any(c("input_variables",
                 "Mlstr_harmo::rule_category",
                 "Mlstr_harmo::algorithm"), ~ . == "undetermined") ~ 
          'undetermined',
        is.na(`Mlstr_harmo::status_detail`)                        ~ 
          'unknown',
        TRUE                                                       ~ 
          .data$`Mlstr_harmo::status_detail`))
  
  # step cleaning : id_creation must be the first entry
  object <-
    object %>%
    group_by(.data$`input_dataset`) %>%
    group_modify(.f = ~ 
                bind_rows(
                  dplyr::filter(., `Mlstr_harmo::rule_category` %in% 'id_creation'),
                  dplyr::filter(.,!`Mlstr_harmo::rule_category` %in% 'id_creation')))
    
  if(is.null(object[['index']])){
    object <-
      object %>%
      add_index()}

  # step cleaning : addition of index
  if(is.null(object[['index']])){
    object <-
      object %>%
      add_index()}
  
  # if all test pass:
  object <- 
    object %>%
    ungroup %>%
    select('index',
           'dataschema_variable',
           'valueType',
           'input_dataset',
           'input_variables',
           'Mlstr_harmo::rule_category',
           'Mlstr_harmo::algorithm',
           'Mlstr_harmo::status',
           'Mlstr_harmo::status_detail',
           'Mlstr_harmo::comment',
           everything())
    
    attributes(object)$`Rmonize::class` <- "data_proc_elem"
    return(object)

  }

  # dataschema_variable column must exist
  if(is.null(object[['dataschema_variable']])){
    stop(call. = FALSE,
"Column 'dataschema_variable'in your Data Processing Elements is missing.")}
  
  # input_dataset column must exist
  if(is.null(object[['input_dataset']])){
    stop(call. = FALSE,
"Column 'input_dataset' in your Data Processing Elements is missing.")}
  
  # input_variables column must exist
  if(is.null(object[['input_variables']])){
    stop(call. = FALSE,
"Column 'input_variables' in your Data Processing Elements is missing.")}
  
  # Mlstr_harmo::rule_category column must exist
  if(is.null(object[['Mlstr_harmo::rule_category']])){
    stop(call. = FALSE,
"Column 'Mlstr_harmo::rule_category' in your Data Processing Elements is missing.")}
  
  # Mlstr_harmo::algorithm column must exist
  if(is.null(object[['Mlstr_harmo::algorithm']])){
    stop(call. = FALSE,
"Column 'Mlstr_harmo::algorithm' in your Data Processing Elements is missing.")}
  
  # else
  stop(call. = FALSE,
"\n\nThis object is not a Data Processing Elements as defined by 
Maelstrom standards, which must be a data frame containing at least these columns:
  `dataschema_variable`
  `input_dataset`
  `input_variables`
  `Mlstr_harmo::rule_category`
  `Mlstr_harmo::algorithm`
  
Please refer to documentation.")

  return(object)
}

#' @title
#' Validate and coerce as a DataSchema object
#'
#' @description
#' Checks if an object is a valid DataSchema and returns it with the appropriate 
#' `Rmonize::class` attribute. This function mainly helps validate inputs within 
#' other functions of the package but could be used separately to ensure that an 
#' object has an appropriate structure.
#'
#' @details
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique. 
#' 
#' The object may be specifically formatted to be compatible with additional 
#' [Maelstrom Research software](https://maelstrom-research.org/page/software), 
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @param object A potential DataSchema object to be coerced.
#' @param as_dataschema_mlstr Whether the output DataSchema should be coerced 
#' with specific format restrictions for compatibility with other 
#' Maelstrom Research software. FALSE by default.
#'
#' @returns
#' A list of data frame(s) named 'Variables' and (if any) 'Categories', 
#' with `Rmonize::class` 'dataschema'.
#'
#' @examples
#' {
#'
#' # Use Rmonize_DEMO to run examples.
#' library(dplyr)
#'
#' glimpse(as_dataschema(Rmonize_DEMO$`dataschema - final`))
#' 
#' }
#'
#' @import dplyr
#' @importFrom rlang .data
#'
#' @export
as_dataschema <- function(object, as_dataschema_mlstr = FALSE){

  if(!is.logical(as_dataschema_mlstr))
    stop(call. = FALSE,
         '`as_dataschema_mlstr` must be TRUE or FALSE (FALSE by default)')

  if(is_data_dict(object)){
    
    if(as_dataschema_mlstr == TRUE){
      if(is_data_dict_mlstr(object)){
        object <- as_data_dict_mlstr(object,name_standard = FALSE)
        attributes(object)$`Rmonize::class` <- "dataschema_mlstr"
        return(object)}
    }else{
      object <- as_data_dict(object)
      attributes(object)$`Rmonize::class` <- "dataschema"
      return(object)
    }
    
  }
  
  stop(call. = FALSE,
         "

The DataSchema contains errors or does not conform to the required structure, 
Please refer to documentation.\n\n",
bold("Useful tip:\n"),
"Use dataschema_evaluate(dataschema) to get an assessment of your DataSchema")
  
}

#' @title
#' Validate and coerce as a DataSchema object with specific format restrictions
#'
#' @description
#' Checks if an object is a valid DataSchema with specific format restrictions 
#' for compatibility with other Maelstrom Research software and returns it with 
#' the appropriate `Rmonize::class` attribute. This function mainly helps validate 
#' inputs within other functions of the package but could be used separately to 
#' ensure that an object has an appropriate structure.
#'
#' @details
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique.  
#' 
#' The object may be specifically formatted to be compatible with additional 
#' [Maelstrom Research software](https://maelstrom-research.org/page/software), 
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @param object A potential DataSchema object to be coerced.
#'
#' @returns
#' A list of data frame(s) named 'Variables' and (if any) 'Categories', with 
#' `Rmonize::class` 'dataschema_mlstr'.
#'
#' @examples
#' {
#'
#' # Use Rmonize_DEMO to run examples.
#' library(dplyr)
#'
#' glimpse(as_dataschema_mlstr(Rmonize_DEMO$`dataschema - final`))
#' 
#' }
#'
#' @import dplyr
#' @importFrom rlang .data
#'
#' @export
as_dataschema_mlstr <- function(object){

  object <- as_dataschema(object,as_dataschema_mlstr = TRUE)
  # object <- as_data_dict_mlstr(object)
  # 
  # # if all test pass:
  attributes(object)$`Rmonize::class` <- "dataschema_mlstr"

  return(object)
}


#' @title
#' Validate and coerce as a harmonized dossier object
#'
#' @description
#' Checks if an object is a valid harmonized dossier and returns it with the 
#' appropriate `Rmonize::class` attribute. This function mainly helps validate 
#' inputs within other functions of the package but could be used separately to 
#' ensure that an object has an appropriate structure. The function 
#' has two arguments that can optionally be declared by the user 
#' (`unique_col_dataset` and `unique_col_id`). `unique_col_dataset` refers to 
#' the columns which contains name of each harmonized dataset. `unique_col_id` 
#' refers to the column in harmonized datasets which identifies unique 
#' combinations of observation/dataset. These two columns are added to ensure 
#' that there is always a unique entity identifier when datasets are pooled.
#'
#' @details
#' A harmonized dossier is a named list containing one or more data frames, 
#' which are harmonized datasets. A harmonized dossier is generally the 
#' product of applying processing to a dossier object The name of each 
#' harmonized dataset (data frame) is taken from the reference input dataset. 
#' A harmonized dossier also contains the DataSchema and 
#' Data Processing Elements used in processing as attributes.
#' 
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique. 
#'  
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param object A A potential harmonized dossier object to be coerced.
#' @param dataschema A DataSchema object.
#' @param data_proc_elem A Data Processing Elements object.
#' @param harmonized_col_id A character string identifying the name of the 
#' column present in every dataset to use as a dataset identifier.
#' @param harmonized_col_dataset A character string identifying the column 
#' to use for dataset names.
#' @param harmonized_data_dict_apply Whether to apply the dataschema to each 
#' harmonized dataset. FALSE by default.
#'
#' @returns
#' A list of data frame(s), containing harmonized dataset(s). 
#' The DataSchema and Data Processing Elements are preserved as attributes of 
#' the output harmonized dossier.
#'
#' @examples
#' {
#' 
#' # Use Rmonize_DEMO to run examples.
#' library(dplyr)
#' 
#' glimpse(as_harmonized_dossier(Rmonize_DEMO$harmonized_dossier))
#'   
#' }
#'
#' @import dplyr fabR
#' @importFrom rlang .data
#'
#' @export
as_harmonized_dossier <- function(
    object, 
    dataschema = attributes(object)$`Rmonize::DataSchema`,
    data_proc_elem = attributes(object)$`Rmonize::Data Processing Elements`,
    harmonized_col_id = attributes(object)$`Rmonize::harmonized_col_id`,
    harmonized_col_dataset = attributes(object)$`Rmonize::harmonized_col_dataset`,
    harmonized_data_dict_apply = FALSE){

  silently_run(dossier_create(object))
  
  if(!is.logical(harmonized_data_dict_apply))
    stop(call. = FALSE,
         '`harmonized_data_dict_apply` must be TRUE or FALSE (TRUE by default)')

  # check the id column 
  if(is.null(harmonized_col_id))
    stop(message('ERROR 100'))
  #   stop(call. = FALSE,
  #        '`harmonized_col_id` must be provided')
  
  # check if col_id exists
  bind_rows(
    object %>% lapply(function(x) x %>% 
                        mutate(across(everything(),as.character)))) %>% 
    select(all_of(harmonized_col_id))
  
  # check the DataSchema
  if(is.null(dataschema)){
    dataschema <- data_dict_extract(bind_rows(as.list(object)))
    dataschema$Variables <- 
      dataschema$Variables %>%
      select(-starts_with("Mlstr_harmo::"),-starts_with("Rmonize::"))}

  dataschema <- as_dataschema(dataschema,as_dataschema_mlstr = TRUE)
  
  # check the DPE
  if(is.null(data_proc_elem)){

    ## creation of the DPE from the variables in the DataSchema
    data_proc_elem <- 
      tibble(input_dataset = names(object)) %>% 
      cross_join(tibble(
        input_variables = dataschema$Variables$name,
        valueType = dataschema$Variables$valueType)) %>%
      mutate(
        dataschema_variable = .data$`input_variables`,
        `Mlstr_harmo::rule_category` = ifelse(
          .data$`input_variables` == harmonized_col_id, 
          'id_creation','direct_mapping'),
        `Rmonize::r_script` = 
          ifelse(
            .data$`input_variables` == harmonized_col_id,
          paste0(
          .data$`input_dataset`, 
          " %>% \n  select('",.data$`input_variables`,
          "' = '",.data$`input_variables`,"')"),
          paste0(
            .data$`input_dataset`, 
            " %>% \n  mutate('",.data$`input_variables`,
            "' = `",.data$`input_variables`,"`) %>% ",
            "\n  select('",harmonized_col_id,"','",.data$`dataschema_variable`,"')")),
        `Mlstr_harmo::algorithm` = .data$`Mlstr_harmo::rule_category`)
  }
  
  data_proc_elem <- as_data_proc_elem(data_proc_elem)
  dataschema <- as_data_dict_mlstr(dataschema)
  
  # Warn user that the dataschema should have a categorical variable if
  # harmonized_col_dataset is provided.
  
  if(!is.null(harmonized_col_dataset)){

    # test if harmonized_col_dataset exists
    bind_rows(
      as.list(object) %>% lapply(function(x) x %>% 
                          mutate(across(everything(),as.character)))) %>% 
      select(all_of(harmonized_col_dataset))
    
  }
  
  # check the presence of same column names in the dataset(s)
  same_names <- 
    all(
      sapply(
        sapply(
          object %>% lapply(function(x) attributes(x)$`names`),
          `%in%`,
          dataschema$`Variables`$`name`),
        identical,
        TRUE))
  
  if(!same_names) 
    stop(call. = FALSE,
"All of your column(s) in the harmonized dataset(s) must be in the DataSchema 
name list of variables.")
  
  
  # Assignation of col_id in the datasets
  object <- 
    object %>% lapply(function(x) as_dataset(x, col_id = harmonized_col_id))
  
  if(harmonized_data_dict_apply == TRUE){
    
    message(bold("\n
- CREATION OF HARMONIZED DATA DICTIONARY : --------------------------------\n"))
    

    
    for(i in names(object)){
      # stop()}
      
      harmo_data_dict <- dataschema
      harmo_data_dict[['Variables']][['Mlstr_harmo::rule_category']] <- NULL
      harmo_data_dict[['Variables']][['Mlstr_harmo::algorithm']] <- NULL
      harmo_data_dict[['Variables']][['Rmonize::r_script']] <- NULL
      harmo_data_dict[['Variables']][['Mlstr_harmo::comment']] <- NULL
      harmo_data_dict[['Variables']][['Mlstr_harmo::status']] <- NULL
      harmo_data_dict[['Variables']][['Mlstr_harmo::status_detail']] <- NULL
      
      input_data_proc_elem <-
        data_proc_elem %>%
        rename("name" = "dataschema_variable") %>%
        dplyr::filter(.data$`input_dataset` == !! i) %>%
        select('name', 
               'Mlstr_harmo::rule_category',
               'Mlstr_harmo::algorithm',
               matches('^Rmonize::r_script$'),
               'Mlstr_harmo::status',
               'Mlstr_harmo::status_detail',
               'Mlstr_harmo::comment')
      
      harmo_data_dict[['Variables']] <-
        harmo_data_dict[['Variables']] %>%
        full_join(input_data_proc_elem, by = 'name')
      
      object[[i]] <- 
        valueType_adjust(
          from = harmo_data_dict,
          to = as_dataset(object[[i]])) %>%
        dataset_zap_data_dict() %>%
        data_dict_apply(harmo_data_dict)
      
      message(bold(i)," : done")
    }
  }
  
  # if all checks TRUE
  attributes(object)$`Rmonize::class` <- "harmonized_dossier"
  attributes(object)$`Rmonize::DataSchema` <- dataschema
  attributes(object)$`Rmonize::Data Processing Elements` <- data_proc_elem
  attributes(object)$`Rmonize::harmonized_col_id` <- harmonized_col_id
  
  if(!is.null(harmonized_col_dataset)){
    attributes(object)$`Rmonize::harmonized_col_dataset` <- harmonized_col_dataset
  }
  return(object)
}

#' @title
#' Generate a pooled harmonized dataset from a harmonized dossier
#' 
#' @description
#' Generates a pooled harmonized dataset from a harmonized dossier. The function 
#' has two arguments that can optionally be declared by the user 
#' (`unique_col_dataset` and `unique_col_id`). `unique_col_dataset` refers to 
#' the columns which contains name of each harmonized dataset. `unique_col_id` 
#' refers to the column in harmonized datasets which identifies unique 
#' combinations of observation/dataset. These two columns are added to ensure 
#' that there is always a unique entity identifier when datasets are pooled.
#' 
#' @details
#' A harmonized dossier is a named list containing one or more data frames, 
#' which are harmonized datasets. A harmonized dossier is generally the 
#' product of applying processing to a dossier object The name of each 
#' harmonized dataset (data frame) is taken from the reference input dataset. 
#' A harmonized dossier also contains the DataSchema and 
#' Data Processing Elements used in processing as attributes.
#' 
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique. 
#' 
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param harmonized_dossier A list containing the harmonized dataset(s).
#' @param harmonized_col_id A character string identifying the name of the 
#' column present in every dataset to use as a dataset identifier.
#' @param harmonized_col_dataset A character string identifying the column 
#' to use for dataset names.
#' @param add_col_dataset Whether to add an extra column to each 
#' harmonized dataset. The resulting data frame will have an additional column 
#' and its data dictionary will be updated accordingly adding categories for 
#' this variable if necessary. FALSE by default.
#' @param dataschema A DataSchema object.
#' @param data_proc_elem A Data Processing Elements object.
#'
#' @returns
#' A data frame containing the pooled harmonized dataset.
#' 
#' @examples
#' {
#'
#' # use madshapR_DEMO provided by the package
#' library(dplyr)
#' 
#' harmonized_dossier <- Rmonize_DEMO$harmonized_dossier
#'
#' glimpse(pooled_harmonized_dataset_create(
#'   harmonized_dossier,harmonized_col_id = 'adm_unique_id'))
#'   
#' }
#'
#' @import dplyr fabR
#' @importFrom rlang .data
#'
#' @export
pooled_harmonized_dataset_create <- function(
    harmonized_dossier,
    harmonized_col_dataset = attributes(harmonized_dossier)$`Rmonize::harmonized_col_dataset`,
    harmonized_col_id = attributes(harmonized_dossier)$`Rmonize::harmonized_col_id`,
    add_col_dataset = FALSE,
    dataschema = attributes(harmonized_dossier)$`Rmonize::DataSchema`,
    data_proc_elem = attributes(harmonized_dossier)$`Rmonize::Data Processing Elements`){
  
  if(!is.logical(add_col_dataset))
    stop(call. = FALSE,
         '`add_col_dataset` must be TRUE or FALSE (TRUE by default)')
  
  # check harmonized dossier (except harmonized_col_dataset)
  as_harmonized_dossier(
    object = harmonized_dossier,
    harmonized_col_id = harmonized_col_id,
    dataschema = dataschema,
    data_proc_elem = data_proc_elem)

  pooled_harmonized_dataset <- 
    bind_rows(harmonized_dossier) %>%
    data_dict_apply(dataschema)

  col_dataset <- 
    tibble(.rows = nrow(pooled_harmonized_dataset))
  
  if(is.null(harmonized_col_dataset)){
    if(add_col_dataset == TRUE){
      # if the harmonized_col_dataset is null 
      harmonized_col_dataset <- 'Rmonize::harmonized_col_dataset'
    }
  }
  
  # test whether the harmonized_col_dataset exists and is categorical (
  # NA dont exists but to add
  # TRUE exists and categorical
  # FALSE exists not categorical
  # NULL is not provided)
  
  bool_test <- 
    if(!is.null(harmonized_col_dataset)){
      toString(
        pooled_harmonized_dataset %>% reframe(
          across(any_of(harmonized_col_dataset),  is_category))) %>%
        as.logical %>% 
        as_any_boolean == FALSE}else{NULL}
  
  if(!is.null(bool_test)){
    
    if(bool_test %in% FALSE){} # nothing to warn. the col exists and is categorical
    
    if(bool_test %in% TRUE){
      warning(call. = FALSE,
'\nThe harmonized_col_dataset `',harmonized_col_dataset,'` you declared is not ',
'categorical in your DataSchema.',
'\nThe correspondant categories have been be created in your pooled harmonized dataset.',
bold("\n\nUseful tip:\n"),
'To avoid this warning, we recommend to add the categories in your DataSchema.')

  }
  
    if(is.na(bool_test) & add_col_dataset == TRUE) {
      warning(call. = FALSE,
'\nAn additional variable `',harmonized_col_dataset,'` has been created
with harmonized dataset names as values for each harmonized dataset.',
bold("\n\nUseful tip:\n"),
'To avoid this message, we recommend to set `harmonized_col_dataset` as a 
categorical variable DataSchema, or to set `add_col_dataset` = FALSE.')
    
      col_dataset <- harmonized_dossier
    
      for(i in names(col_dataset)){
        # stop()}
        col_dataset[[i]] <- 
          col_dataset[[i]] %>% 
          mutate(`Rmonize::harmonized_col_dataset` = i) %>%
          select(!!harmonized_col_dataset := 'Rmonize::harmonized_col_dataset')}
      
      col_dataset <- bind_rows(col_dataset)
      
    }
    
    if(is.na(bool_test) & add_col_dataset == FALSE) {
      stop(call. = FALSE,
'\nThe harmonized_col_dataset `',harmonized_col_dataset,'` you declared is not ',
'in your DataSchema.',
bold("\n\nUseful tip:\n"),
'To avoid this error, we recommend to set `harmonized_col_dataset` as a 
categorical variable DataSchema, or to set `add_col_dataset` = FALSE.')
    
    }
  }
  
  pooled_harmonized_dataset <-
    pooled_harmonized_dataset %>%
    bind_cols(col_dataset) %>%
    mutate(across(any_of(harmonized_col_dataset), as_category))

  attributes(pooled_harmonized_dataset)$`Rmonize::class` <- 
    "pooled_harmonized_dataset"
  
  attributes(pooled_harmonized_dataset)$`Rmonize::harmonized_col_id` <- 
    harmonized_col_id
  
  if(!is.null(harmonized_col_dataset)){
    attributes(pooled_harmonized_dataset)$`Rmonize::harmonized_col_dataset` <- 
      harmonized_col_dataset}

  return(pooled_harmonized_dataset)
}

#' @title
#' Test for a valid DataSchema object with specific format restrictions
#'
#' @description
#' Tests if an object is a valid DataSchema object with specific format 
#' restrictions for compatibility with other Maelstrom Research software. This 
#' function mainly helps validate input within other functions of the package 
#' but could be used to check if an object is valid for use in a function.
#'
#' @details
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique.
#' 
#' The object may be specifically formatted to be compatible with additional 
#' [Maelstrom Research software](https://maelstrom-research.org/page/software), 
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @seealso
#' For a better assessment, please use [dataschema_evaluate()].
#'
#' @param object A potential DataSchema object to be evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#' 
#' # use Rmonize_DEMO provided by the package
#'
#' dataschema <- Rmonize_DEMO$`dataschema - final`
#' is_dataschema_mlstr(dataschema)
#' is_dataschema_mlstr(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_dataschema_mlstr <- function(object){
  
  object <- object
  # if only the tibble is given in parameter
  test <- silently_run(try(as_dataschema_mlstr(object),silent = TRUE))
  if(class(test)[1] == 'try-error')    return(FALSE)
  return(TRUE)
}

#' @title
#' Test for a valid DataSchema object
#'
#' @description
#' Tests if the input is a valid DataSchema object. This function mainly helps 
#' validate input within other functions of the package but could be used to 
#' check if an object is valid for use in a function.
#'
#' @details
#' A DataSchema is the list of core variables to generate across datasets and 
#' related metadata. A DataSchema object is a list of data frames with elements 
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables' 
#' element must contain at least the `name` column, and the 'Categories' 
#' element must contain at least the `variable` and `name` columns to be usable 
#' in any function. In 'Variables' the `name` column must also have unique 
#' entries, and in 'Categories' the combination of `variable` and `name` columns 
#' must also be unique.
#'
#' @seealso
#' For a better assessment, please use [dataschema_evaluate()].
#'
#' @param object A potential DataSchema object to be evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#' 
#' # use Rmonize_DEMO provided by the package
#'
#' dataschema <- Rmonize_DEMO$`dataschema - final`
#' is_dataschema(dataschema)
#' is_dataschema(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_dataschema <- function(object){
  
  object <- object
  # if only the tibble is given in parameter
  test <- silently_run(try(as_dataschema(object),silent = TRUE))
  if(class(test)[1] == 'try-error')    return(FALSE)
  return(TRUE)
  
}

#' @title
#' Test for a valid Data Processing Elements object
#'
#' @description
#' Tests if the input is a valid Data Processing Elements object. This function 
#' mainly helps validate input within other functions of the package but could 
#' be used to check if an object is valid for use in a function.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input 
#' variables into harmonized variables in the DataSchema format. It is also 
#' contains metadata used to generate documentation of the processing. 
#' A Data Processing Elements object is a data frame with specific columns 
#' used in data processing: `dataschema_variable`, `input_dataset`, 
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`. 
#' To initiate processing, the first entry must be the creation of a harmonized 
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param object A potential Data Processing Elements object to be evaluated.
#'
#' @returns
#' A logical.
#'
#' @examples
#' {
#' 
#' # use Rmonize_DEMO provided by the package
#'
#' data_proc_elem <- Rmonize_DEMO$`data_processing_elements - final`
#' is_data_proc_elem(data_proc_elem)
#' is_data_proc_elem(iris)
#'
#'}
#'
#' @import dplyr tidyr fabR
#' @importFrom rlang .data
#'
#' @export
is_data_proc_elem <- function(object){
  
  object <- object
  # if only the tibble is given in parameter
  test <- silently_run(try(as_data_proc_elem(object),silent = TRUE))
  if(class(test)[1] == 'try-error')    return(FALSE)
  return(TRUE)
  
}

Try the Rmonize package in your browser

Any scripts or data that you put into this service are public.

Rmonize documentation built on May 29, 2024, 9:09 a.m.