R/exportVariablesDescription.R

Defines functions .extract_question_description .clean_question_names exportVariablesDescription

Documented in exportVariablesDescription

#' Generate descriptions for trial variables
#' 
#' Extracts metadata for variables found in tricot block data and ranking data,
#' combining question descriptors from the registry and assessments. This function
#' harmonizes variable names and assigns relevant descriptions, value types, and
#' placeholders for ontology and controlled vocabularies.
#' 
#' @family export functions
#' @param x An object of class \code{CM_list} containing raw ClimMob trial data.
#' @param tricot_ranks A data frame of tricot rankings generated by \code{exportTricotRanks()}.
#' @param measured_traits A data frame of plot data generated by \code{exportMeasuredTraits()}.
#' @param block_data A data frame of block-level data generated by \code{exportBlockData()}.
#' @return A data frame describing all trial variables, including name, description,
#' value type, and optional fields for ontology and controlled vocabularies.
#' @importFrom stats na.omit
#' @export
exportVariablesDescription = function(x, tricot_ranks, measured_traits, block_data) {
  
  r = tricot_ranks
  
  b = block_data
  
  m = measured_traits
  
  questions = .extract_question_description(x)
  
  # everything with the tag "char_" is type rank
  questions$odktype = ifelse(grepl("char_", questions$name), "rank", questions$odktype)
  
  questions$odktype = ifelse(grepl("clm_end|clm_start", questions$name), "datetime", questions$odktype)
  
  questions$name = .clean_question_names(questions$name)
  
  questions$desc = gsub("- Option A", "", questions$desc)
  
  keep = !duplicated(questions$name)
  
  questions = questions[keep, ]
  
  vars = unique(c(names(b), r$trait, m$trait))
  
  vars = data.frame(variable_name = vars,
                    description = NA,
                    ontology_id = NA,
                    value_type = NA,
                    controlled_vocabulary = NA)
  
  for(i in seq_along(questions$name)) {
    index = grep(paste0(questions$name[i], "$"), vars$variable_name)
    vars[index, "description"] = questions$desc[i]
    vars[index, "value_type"] = questions$odktype[i]
  }
  
  blockdesc = c("The id of trial block, which is the trial id + tricot package",
                "text")
  
  plotdesc = paste("The name of the variety in plot", LETTERS[1:3])
  
  vars[vars$variable_name == "block_id", c(2, 4)] =  blockdesc
  
  vars[vars$variable_name == "package_item_A", c(2, 4)] =  c(plotdesc[1], "text")
  
  vars[vars$variable_name == "package_item_B", c(2, 4)] =  c(plotdesc[2], "text")
  
  vars[vars$variable_name == "package_item_C", c(2, 4)] =  c(plotdesc[3], "text")
  
  # run over the questions type "select one" and "select all that apply" to get 
  # the controlled vocabulary options
  choices = vars$variable_name[grep("select one|select all that apply", vars$value_type)]
  choices = choices[choices %in% names(b)]
  
  for(i in seq_along(choices)){
    index = grep(paste0(choices[i], "$"), vars$variable_name)
    values = unique(stats::na.omit(trimws(unlist(strsplit(as.character(b[[choices[i]]]), ";\\s*")))))
    values = values[values != ""]
    values = paste(sort(values), collapse = "|")
    vars[index, "controlled_vocabulary"] = values
  }
  
  # add controlled vocabulary for ranks
  vars$controlled_vocabulary = ifelse(vars$value_type == "rank",
                                      paste(1:3, collapse = "|"),
                                      vars$controlled_vocabulary)
  
  vars[is.na(vars)] = "No information provided"
  
  class(vars) = union( "CM_df", class(vars))
  
  return(vars)
  
}

#' Clean question names
#' @noRd
.clean_question_names = function(names_vec) {
  names_vec = gsub("_char|char_|_pos$|_neg$", "", names_vec)
  names_vec = sub("^_", "", names_vec)
  names_vec = gsub("perf_overallper_1", "item_A_vs_local", names_vec)
  names_vec = gsub("perf_overallper_2", "item_B_vs_local", names_vec)
  names_vec = gsub("perf_overallper_3", "item_C_vs_local", names_vec)
  names_vec = gsub("overallpos|overallneg", "overall", names_vec)
  names_vec = gsub("clm_start", "survey_start", names_vec)
  names_vec = gsub("clm_end", "survey_end", names_vec)
  names_vec = gsub("gender1$", "gender", names_vec)
  names_vec = gsub("_(a|b|c)$", "", names_vec)
  return(names_vec)
}

#' Extract questions description 
#' @noRd
.extract_question_description = function(x){
  # get question descriptors
  questions = as.data.frame(x$registry$fields)
  
  q2 = lapply(x$assessments$fields, function(a) {
    if (is.data.frame(a)) return(a)
    NULL
  })
  
  q2 = do.call("rbind", q2[!sapply(q2, is.null)])
  
  questions = rbind(questions, q2)
  
  return(questions)
  
}

Try the ClimMobTools package in your browser

Any scripts or data that you put into this service are public.

ClimMobTools documentation built on Nov. 15, 2025, 5:06 p.m.