Nothing
#' set_attributes
#'
#' set_attributes
#' @param attributes a joined table of all attribute metadata
#' @param factors a table with factor code-definition pairs; see details
#' @param col_classes optional, list of R column classes ('ordered', 'numeric', 'factor', 'Date', or 'character', case sensitive)
#' will let the function infer missing 'domain' and 'measurementScale' values for attributes column.
#' Should be in same order as attributeNames in the attributes table, or be a named list with names corresponding to attributeNames
#' in the attributes table.
#' @param missingValues optional, a table with missing value code-deinition pairs; see details
#' @details The attributes data frame must use only the recognized column
#' headers shown here. The attributes data frame must contain columns for required metadata.
#' These are:
#'
#' \strong{For all data:}
#'
#' \itemize{
#' \item attributeName (required, free text field)
#'
#' \item attributeDefinition (required, free text field)
#'
#' \item measurementScale (required, "nominal", "ordinal", "ratio", "interval", or "dateTime",
#' case sensitive) but it can be inferred from col_classes.
#'
#' \item domain (required, "numericDomain", "textDomain", "enumeratedDomain", or "dateTimeDomain",
#' case sensitive) but it can be inferred from col_classes.
#' }
#'
#' \strong{For numeric (ratio or interval) data:}
#' \itemize{
#' \item unit (required). Unitless values should use "dimensionless" as the unit.
#' }
#'
#' \strong{For character (textDomain) data:}
#' \itemize{
#' \item definition (required)
#' }
#'
#' \strong{For dateTime data:}
#' \itemize{
#' \item formatString (required)
#' }
#'
#' Other optional allowed columns in the attributes table are:
#' source, pattern, precision, numberType, missingValueCode, missingValueCodeExplanation,
#' attributeLabel, storageType, minimum, maximum
#'
#' The \strong{factors} data frame, required for attributes in an enumerated domain, must use only the
#' following recognized column headers:
#' \itemize{
#' \item attributeName (required)
#' \item code (required)
#' \item definition (required)
#' }
#'
#' The \strong{missingValues} data frame, optional, can be used in the case that multiple missing value codes
#' need to be set for the same attribute. This table must contain the following recognized column
#' headers.
#' \itemize{
#' \item attributeName (required)
#' \item code (required)
#' \item definition (required)
#' }
#'
#' @return an eml "attributeList" object
#' @export
set_attributes <-
function(attributes,
factors = NULL,
col_classes = NULL,
missingValues = NULL) {
## convert factors to data.frame because it could be a tibble
## or tbl_df
factors <- as.data.frame(factors)
missingValues <- as.data.frame(missingValues)
## all as characters please (no stringsAsFactors!)
attributes[] <- lapply(attributes, as.character)
factors[] <- lapply(factors, as.character)
missingValues[] <- lapply(missingValues, as.character)
## check attributes data.frame.
## must declare required columns: attributeName, attributeDescription
## infer "domain" & "measurementScale" given optional column classes
attributes <-
check_and_complete_attributes(attributes, col_classes)
# check factors
if (nrow(factors) != 0) {
check_codeDefinitions(factors, type = "factors")
}
# check missingValues
if (nrow(missingValues) != 0) {
check_codeDefinitions(missingValues, type = "missingValues")
}
## Add NA columns if necessary FIXME some of these can
## be missing if their class isn't represented, but otherwise
## must be present
for (x in c(
"precision",
"minimum",
"maximum",
"unit",
"numberType",
"formatString",
"definition",
"pattern",
"source",
"attributeLabel",
"storageType",
"missingValueCode",
"missingValueCodeExplanation",
"valueURI",
"valueLabel",
"propertyURI",
"propertyLabel"
)) {
attributes <- add_na_column(x, attributes)
}
out <- list()
out$attribute <-
lapply(1:dim(attributes)[1], function(i)
set_attribute(attributes[i, ], factors = factors, missingValues = missingValues))
as_emld(out)
}
set_attribute <- function(row, factors = NULL, missingValues = NULL) {
s <- row[["measurementScale"]]
if (s %in% c("ratio", "interval")) {
if (!is_standardUnit(row[["unit"]])) {
type <- "customUnit"
warning(
"Unit '",
row[["unit"]],
"' is not a recognized standard unit; treating as custom unit. ",
"Please be sure you also define a custom unit in your EML record, ",
"or replace with a recognized standard unit. ",
if(is.na(row[["unit"]])){
'For unitless values, use "dimensionless" as the unit. '
} ,
"See set_unitList() for details."
)
} else {
type <- "standardUnit"
}
u <- setNames(list(list()), type)
u[[type]] <- row[["unit"]]
node <- list(
unit = u,
precision = row[["precision"]],
numericDomain = list(
numberType = row[["numberType"]],
bounds = set_BoundsGroup(row)
)
)
}
if (s %in% c("ordinal", "nominal")) {
node <- list(nonNumericDomain = list())
if (row[["domain"]] == "textDomain") {
n <- list(
definition = row[["definition"]],
source = row[["source"]],
pattern = row[["pattern"]]
)
node$nonNumericDomain$textDomain <- n
} else if (row[["domain"]] == "enumeratedDomain") {
node$nonNumericDomain$enumeratedDomain <-
set_codeDefinitions(row, code_set = factors, type = "factors")
}
}
if (s %in% c("dateTime")) {
if (is.na(row[["formatString"]])) {
warning(paste0(
"The required formatString is missing for attribute ",
row[["attributeName"]]
))
}
node <- list(
formatString = row[["formatString"]],
dateTimePrecision = row[["precision"]],
dateTimeDomain = list(bounds = set_BoundsGroup(row))
)
}
measurementScale <- setNames(list(list()), s)
measurementScale[[s]] <- node
missingValueCode <- NULL
if (!is.na(row[["missingValueCode"]]) & !(row[["attributeName"]] %in% unique(missingValues$attributeName))) {
missingValueCode <-
list(
code = na2empty(row[["missingValueCode"]]),
codeExplanation = na2empty(row[["missingValueCodeExplanation"]])
)
}
else if (is.na(row[["missingValueCode"]]) & row[["attributeName"]] %in% unique(missingValues$attributeName)){
missingValueCode <- set_codeDefinitions(row, code_set = missingValues, type = "missingValues")
}
else if (!is.na(row[["missingValueCode"]]) & row[["attributeName"]] %in% unique(missingValues$attributeName)){
warning(
paste0("The attribute '",
row[["attributeName"]],
"' has missing value codes set in both the 'attributes' and 'missingValues' data.frames.
Using codes from 'missingValues' data.frame."
)
)
missingValueCode <- set_codeDefinitions(row, code_set = missingValues, type = "missingValues")
}
annotation <- NULL
if (!is.na(row[["valueURI"]]) & !is.na(row[["valueLabel"]]) & !is.na(row[["propertyURI"]]) & !is.na(row[["propertyLabel"]])) {
annotation <-
list(
propertyURI = list(label = row[["propertyLabel"]], propertyURI = row[["propertyURI"]]),
valueURI = list(label = row[["valueLabel"]], valueURI = row[["valueURI"]])
)
}
# Having NA for id is invalid, instead explicitly set to NULL
id <- NULL
if(!is.null(row[["id"]]) && !is.na(row[["id"]])) {
id <- row[["id"]]
}
list(
id = id,
attributeName = row[["attributeName"]],
attributeDefinition = row[["attributeDefinition"]],
attributeLabel = row[["attributeLabel"]],
storageType = row[["storageType"]],
missingValueCode = missingValueCode,
measurementScale = measurementScale,
annotation = annotation
)
}
set_codeDefinitions <- function(row, code_set, type) {
name <- row[["attributeName"]]
df <- code_set[code_set$attributeName == name, ]
if (type == "factors"){
ListOfcodeDefinition <- lapply(1:dim(df)[1], function(i) {
list(
code = df[i, "code"],
definition = df[i, "definition"]
)
})
list(codeDefinition = ListOfcodeDefinition)
}
else if (type == "missingValues"){
if (nrow(df) > 0){
ListOfcodeDefinition <- lapply(1:dim(df)[1], function(i) {
list(
code = df[i, "code"],
codeExplanation = df[i, "definition"]
)
})
}
}
}
set_BoundsGroup <- function(row) {
if (!is.na(row[["minimum"]])) {
minimum <- list(na2empty(row[["minimum"]]),
"exclusive" = "false"
)
} else {
minimum <- NULL
}
if (!is.na(row[["maximum"]])) {
maximum <- list(na2empty(row[["maximum"]]),
"exclusive" = "false"
)
} else {
maximum <- NULL
}
list(minimum = minimum, maximum = maximum)
}
infer_domain_scale <-
function(col_classes,
attributeName = names(col_classes),
attributes) {
if (length(col_classes) != nrow(attributes)) {
if (is.null(names(col_classes))) {
stop(
call. = FALSE,
"If col_classes is not NULL, it must have
as many elements as there are rows in attributes unless they are named."
)
}
}
if (!is.null(names(col_classes))) {
if (!(all(names(col_classes) %in% attributeName))) {
stop(
call. = FALSE,
"If col_classes is a named list, it should have names
corresponding to attributeName."
)
}
}
if (!(all(
col_classes[!is.na(col_classes)] %in%
c("numeric", "character", "factor", "Date", "ordered")
))) {
stop(
call. = FALSE,
"All non missing col_classes values have to
be 'ordered', 'numeric', 'character', 'factor' or 'Date'."
)
}
domain <- col_classes
measurementScale <- col_classes
storageType <- col_classes
domain[col_classes == "numeric"] <- "numericDomain"
domain[col_classes == "character"] <- "textDomain"
domain[col_classes %in% c("factor", "ordered")] <-
"enumeratedDomain"
domain[col_classes %in% c("Date")] <- "dateTimeDomain"
# compare domain with domain given in attributes if there is one
if ("domain" %in% names(attributes)) {
if (!is.null(names(col_classes))) {
if (any(domain !=
attributes$domain[
attributes$attributeName == names(col_classes)
])) {
whichNot <-
names(col_classes)[which(domain !=
attributes$domain[
attributes$attributeName ==
names(col_classes)
])]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the domain value inferred from col_classes
does not agree with the domain value existing
in attributes. Check col_classes and the domain
column you provided.\n"
)
)
}
} else {
if (any(domain != attributes$domain)) {
whichNot <-
attributes$attributeName[which(domain != attributes$domain)]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the domain value inferred from col_classes
does not agree with the domain value existing
in attributes. Check col_classes and the domain column
you provided.\n"
)
)
}
}
}
# Map "numeric" cols to "ratio" by default
measurementScale[col_classes == "numeric"] <- "ratio"
# But trust the user if they specify "interval"
if ("measurementScale" %in% names(attributes)) {
measurementScale[
col_classes == "numeric" &
attributes$measurementScale == "interval"] <- "interval"
}
measurementScale[col_classes == "character"] <- "nominal"
measurementScale[col_classes == "ordered"] <- "ordinal"
measurementScale[col_classes == "factor"] <- "nominal"
measurementScale[col_classes %in% c("Date")] <- "dateTime"
# compare measurementScale with measurementScale
# given in attributes if there is one
if ("measurementScale" %in% names(attributes)) {
if (!is.null(names(col_classes))) {
if (any(measurementScale !=
attributes$measurementScale[
attributes$attributeName ==
names(col_classes)
])) {
whichNot <-
names(col_classes)[
which(measurementScale !=
attributes$measurementScale[
attributes$attributeName ==
names(col_classes)
])
]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the measurementScale value
inferred from col_classes does not
agree with the measurementScale value
existing in attributes. Check col_classes
and the measurementScale column you provided.\n"
)
)
}
} else {
if (any(measurementScale != attributes$measurementScale)) {
whichNot <-
attributes$attributeName[
which(measurementScale != attributes$measurementScale)
]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the measurementScale value inferred from col_classes
does not agree with the measurementScale value existing
in attributes. Check col_classes and the measurementScale
column you provided.\n"
)
)
}
}
}
## storage type is optional, maybe better not to set this?
# Map "numeric" cols to "float" by default
storageType[col_classes == "numeric"] <- "float"
# But trust the user if they specify "integer"
if ("storageType" %in% names(attributes)) {
storageType[
col_classes == "numeric" &
attributes$storageType == "integer"] <- "integer"
}
storageType[col_classes == "character"] <- "string"
storageType[col_classes %in% c("factor", "ordered")] <- "string"
storageType[col_classes %in% c("Date")] <- "date"
# compare storageType with storageType given in attributes if there is one
if ("storageType" %in% names(attributes)) {
if (!is.null(names(col_classes))) {
if (any(storageType != attributes$storageType[
attributes$attributeName == names(col_classes)
])) {
whichNot <-
names(col_classes)[
which(storageType != attributes$storageType[
attributes$attributeName == names(col_classes)
])
]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the storageType value inferred from col_classes
does not agree with the storageType value existing in attributes.
Check col_classes and the storageType column you provided.\n"
)
)
}
} else {
if (any(storageType != attributes$storageType)) {
whichNot <-
attributes$attributeName[
which(storageType != attributes$storageType)
]
stop(
call. = FALSE,
paste0(
"For the attribute ",
whichNot,
" the storageType value inferred from col_classes
does not agree with the storageType value existing
in attributes. Check col_classes and the storageType
column you provided.\n"
)
)
}
}
}
data.frame(
attributeName = attributeName,
domain = domain,
measurementScale = measurementScale,
storageType = storageType,
stringsAsFactors = FALSE
)
}
add_na_column <- function(column, df) {
if (!column %in% names(df)) {
df[[column]] <- as.character(NA)
}
df
}
na2empty <- function(x) {
if (!is.null(x)) {
if (is.na(x)) {
x <- character()
} else if (is.numeric(x)) {
x <- as.character(x)
}
}
x
}
check_and_complete_attributes <- function(attributes, col_classes) {
if (!"attributeName" %in% names(attributes)) {
stop(
call. = FALSE,
"attributes table must include an 'attributeName' column"
)
} else {
if (any(is.na(attributes$attributeName))) {
stop(
call. = FALSE,
"The attributeName column must be filled for each attribute."
)
}
}
attribute_names <- c("source",
"pattern",
"measurementScale",
"unit",
"precision",
"numberType",
"domain",
"definition",
"formatString",
"missingValueCode",
"missingValueCodeExplanation",
"attributeName",
"attributeDefinition",
"attributeLabel",
"storageType",
"minimum",
"maximum",
"id",
"propertyLabel",
"propertyURI",
"valueLabel",
"valueURI")
if (any(!names(attributes) %in% attribute_names)) {
stop(
call. = FALSE,
paste0("The column names '",
paste(names(attributes)[which(!(names(attributes) %in% attribute_names))], collapse = ", "),
"' in the attributes table are not recognized.")
)
}
## infer "domain" & "measurementScale" given optional column classes
if (!is.null(col_classes)) {
attributes <-
merge(
attributes,
infer_domain_scale(
col_classes, attributes$attributeName,
attributes
),
all = TRUE,
sort = FALSE
)
}
if (!"attributeDefinition" %in% names(attributes)) {
stop(
call. = FALSE,
"attributes table must include an 'attributeDefinition' column"
)
} else {
if (any(is.na(attributes$attributeDefinition))) {
stop(
call. = FALSE,
"The attributeDefinition column must be filled for each attribute."
)
}
}
if (!"measurementScale" %in% names(attributes)) {
stop(
call. = FALSE,
"attributes table must include an 'measurementScale'
column, or you need to input 'col_classes'."
)
} else {
if (any(is.na(attributes$measurementScale))) {
stop(
call. = FALSE,
"The measurementScale column must be filled for each attribute."
)
} else {
if (!(all(
attributes$measurementScale %in% c(
"nominal", "ordinal", "ratio",
"interval", "dateTime"
)
))) {
stop(
call. = FALSE,
"measurementScale permitted values are 'nominal',
'ordinal', 'ratio', 'interval', 'dateTime'."
)
}
}
}
if (!"domain" %in% names(attributes)) {
stop(
call. = FALSE,
"attributes table must include an 'domain' column,
or you need to input 'col_classes'."
)
} else {
if (any(is.na(attributes$domain))) {
stop(
call. = FALSE,
"The domain column must be filled for each attribute."
)
} else {
if (!(all(
attributes$domain %in% c(
"numericDomain",
"textDomain",
"enumeratedDomain",
"dateTimeDomain"
)
))) {
stop(
call. = FALSE,
"domain permitted values are 'numericDomain', 'textDomain',
'enumeratedDomain', 'dateTimeDomain'."
)
}
}
}
# Check that measurementScale and domain values make valid combinations
if ("measurementScale" %in% names(attributes) &&
"domain" %in% names(attributes)) {
for (i in seq_len(nrow(attributes))) {
mscale <- attributes[i, "measurementScale"]
domain <- attributes[i, "domain"]
if (mscale %in% c("nominal", "ordinal") &&
!(domain %in% c("enumeratedDomain", "textDomain"))) {
stop(
call. = FALSE,
paste0(
"The attribute in row ",
i,
" has an invalid combination of measurementScale (",
mscale,
") and domain (",
domain,
"). For a measurementScale of '",
mscale,
"', domain must be either 'enumeratedDomain' or 'textDomain'."
)
)
} else if (mscale %in% c("interval", "ratio") &&
domain != "numericDomain") {
stop(
call. = FALSE,
paste0(
"The attribute in row ",
i,
" has an invalid combination of measurementScale (",
mscale,
") and domain (",
domain,
"). For a measurementScale of '",
mscale,
"', domain must be 'numericDomain'."
)
)
} else if (mscale == "dateTime" &&
!is.null(domain) && domain != "dateTimeDomain") {
stop(
call. = FALSE,
paste0(
"The attribute in row ",
i,
" has an invalid combination of measurementScale (",
mscale,
") and domain (",
domain,
"). For a measurementScale of '",
mscale,
"', domain must be 'dateTimeDomain'."
)
)
}
}
}
return(attributes)
}
# number of codes by attributeName in factors
count_levels <- function(attributeName, factors) {
factors <- factors[factors$attributeName == attributeName, ]
length(unique(factors$code))
}
# number of lines by attributeName in factors
count_lines <- function(attributeName, factors) {
factors <- factors[factors$attributeName == attributeName, ]
nrow(factors)
}
# check the names of factors
# check that for each attributeName codes are unique
check_codeDefinitions <- function(code_set, type) {
if (!all(c("attributeName", "code", "definition") %in% names(code_set))) {
stop(
paste0("The ", type, " data.frame should have
variables called attributeName, code and definition."
),
call. = FALSE
)
}
lines_no <- vapply(unique(code_set$attributeName),
count_lines,
factors = code_set, 1
)
levels_no <- vapply(unique(code_set$attributeName),
count_levels,
factors = code_set, 1
)
forcheck <- data.frame(
lines_no = lines_no,
levels_no = levels_no,
attributeName = unique(code_set$attributeName)
)
notequal <- forcheck[forcheck$lines_no != forcheck$levels_no, ]
if (nrow(notequal) != 0) {
stop(
paste(paste0("There are attributeName(s) in ", type, " with duplicate codes:"
),
notequal$attributeName
),
call. = FALSE
)
}
}
#' is_standardUnit
#'
#' @param x name of unit to check
#'
#' @return TRUE if unit is exact match to the id of
#' a unit in the Standard Units Table, FALSE otherwise.
#' @export
#'
#' @examples
#' is_standardUnit("amperePerMeter") # TRUE
#' is_standardUnit("speciesPerSquareMeter") # FALSE
is_standardUnit <- function(x) {
## standard_unit_list <- read.csv(
## system.file("units/standard_unit_list.csv", package = "EML"))
standard_unit_list <- standardUnits$units$id
(x %in% standard_unit_list)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.