R/qa_package.R

#' Check package including congruence of attributes and data
#'
#' This function checks that the attributes listed in the metadata match the values in the data for each
#' tabular data object. It may also optionally check if all creators have ORCIDs and have full access
#' to all elements of the data package.
#'
#' @param mn (MNode/CNode) The DataONE Member Node (or Coordinating Node) to query.
#' @param resource_map_pid (character) The PID for a resource map.
#' @param read_all_data (logical) Read all data from remote and check that column types match attributes. If `FALSE`,
#'   only read the first 10 rows. Only applicable to public packages (private objects are downloaded in full regardless).
#'   If `check_attributes = FALSE`, no rows will be read.
#' @param check_attributes (logical) Check congruence of attributes and data.
#' @param check_creators (logical) Check if each creator has an ORCID. Will also run if `check_access = TRUE`.
#' @param check_access (logical) Check if each creator has full access to the metadata, resource map, and data objects.
#'   Will not run if the checks associated with `check_creators` fail.
#'
#' @return `NULL`
#'
#' @import arcticdatautils
#' @import dataone
#' @import EML
#' @importFrom crayon green red yellow
#' @importFrom datapack hasAccessRule
#' @importFrom methods is slot
#' @importFrom ncdf4 nc_open ncvar_get
#' @importFrom readxl read_excel
#' @importFrom sf read_sf st_set_geometry
#' @importFrom utils read.csv read.delim read.table download.file unzip
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Run all QA checks
#'
#' qa_package(mn, pid, read_all_data = TRUE, check_attributes = TRUE,
#'            check_creators = TRUE, check_access = TRUE)
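#'
#' # Check only creators and access
#' qa_package(mn, pid, read_all_data = FALSE, check_attributes = FALSE,
#'            check_creators = TRUE, check_access = TRUE)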
#' }
qa_package <- function(mn, resource_map_pid, read_all_data = TRUE, check_attributes = TRUE, check_creators = FALSE, check_access = FALSE) {
    stopifnot(class(mn) %in% c("MNode", "CNode"))
    stopifnot(is.character(resource_map_pid), nchar(resource_map_pid) > 0)
    stopifnot(is.logical(read_all_data))
    stopifnot(is.logical(check_attributes))
    stopifnot(is.logical(check_creators))
    stopifnot(is.logical(check_access))

    package <- tryCatch(suppressWarnings(arcticdatautils::get_package(mn, resource_map_pid, file_names = TRUE)),
                        error = function(e) stop("\nFailed to get package. Is the Member Node correct? Is your DataONE token set?"))

    cat(crayon::green(paste0("\n.....Processing package ", package$resource_map, "...............")))

    doc <- EML::read_eml(dataone::getObject(mn, package$metadata))

    # Check creators
    if (check_creators || check_access) {
        creator_ORCIDs <- qa_creator_ORCIDs(doc)
    }

    # Check access
    if (check_access && length(creator_ORCIDs) > 0) {
        # Check metadata
        sysmeta <- dataone::getSystemMetadata(mn, package$metadata)
        qa_access(sysmeta, creator_ORCIDs)
        # Check resource_map
        sysmeta <- dataone::getSystemMetadata(mn, package$resource_map)
        qa_access(sysmeta, creator_ORCIDs)
        # Check data objects
        for (object in package$data) {
            sysmeta <- dataone::getSystemMetadata(mn, object)
            qa_access(sysmeta, creator_ORCIDs)
        }
    }

    if (!is.null(names(doc$dataset$dataTable)) && length(doc$dataset$dataTable) > 0) {
        doc$dataset$dataTable <- list(doc$dataset$dataTable)
    }
    if (!is.null(names(doc$dataset$otherEntity)) && length(doc$dataset$otherEntity) > 0) {
        doc$dataset$otherEntity <- list(doc$dataset$otherEntity)
    }
    if (!is.null(names(doc$dataset$spatialVector)) && length(doc$dataset$spatialVector) > 0) {
        doc$dataset$spatialVector <- list(doc$dataset$spatialVector)
    }

    if (is.null(eml_get_simple(doc$dataset, "dataTable"))) doc$dataset$dataTable <- list()
    if (is.null(eml_get_simple(doc$dataset, "otherEntity"))) doc$dataset$otherEntity <- list()
    if (is.null(eml_get_simple(doc$dataset, "spatialVector"))) doc$dataset$spatialVector <- list()


    eml_objects <- c(doc$dataset$dataTable, doc$dataset$otherEntity, doc$dataset$spatialVector)

    if (length(eml_objects) == 0) {
        cat(crayon::red("\nNo data objects of a supported format were found in the EML."))
        cat(crayon::green(paste0("\n\n.....Processing complete for package ",
                                 package$resource_map, "...............")))
        return()
    }

    # Preserve order of getting data objects based on data type for correct name assignment
    # Entity names may not match data object names, so use objectName to ensure matches with data names
    names(eml_objects) <- c(arcticdatautils::eml_get_simple(doc$dataset$dataTable, "objectName"),
                            arcticdatautils::eml_get_simple(doc$dataset$otherEntity, "objectName"),
                            arcticdatautils::eml_get_simple(doc$dataset$spatialVector, "objectName"))
    # If object names are missing, use entity names instead
    if (is.null(names(eml_objects)) || any(is.na(names(eml_objects)))) {
        names(eml_objects) <- c(arcticdatautils::eml_get_simple(doc$dataset$dataTable, "entityName"),
                                arcticdatautils::eml_get_simple(doc$dataset$otherEntity, "entityName"),
                                arcticdatautils::eml_get_simple(doc$dataset$spatialVector, "entityName"))
    }

    data_objects <- dl_and_read_all_data(mn, package, doc, read_all_data)

    # If missing fileName, assign name to data objects
    for (i in seq_along(data_objects)) {
        if (is.na(names(data_objects)[[i]])) {
            id <- package$data[[i]]
            urls <- unique(arcticdatautils::eml_get_simple(eml_objects, "url")) %>%
                grep("http", ., value = TRUE)
            j <- which(stringr::str_detect(urls, id))
            names(data_objects)[[i]] <-
                if (!is.na(EML::eml_get(eml_objects[[j]], "objectName"))) {
                    EML::eml_get(eml_objects[[j]], "objectName")
                } else {
                    EML::eml_get(eml_objects[[j]], "entityName")
                }
        }
    }
    # Disabled: data_objects contains one entry per data PID (including SKIP
    # placeholders), so its length can legitimately differ from the number of
    # supported EML entities; this length check is unreliable.
    # if (length(eml_objects) != length(data_objects)) {
    #     cat(crayon::red("\nThe number of downloaded data objects does not match the number of EML data objects."))
    #     cat(crayon::green(paste0("\n\n.....Processing complete for package ",
    #                              package$resource_map, "...............")))
    #     return()
    # }

    # Filter out data objects that have SKIP status
    data_objects <- Filter(function(x) suppressWarnings(length(x$status) == 0) || suppressWarnings(x$status != "SKIP"), data_objects)
    eml_objects <- Filter(function(x) EML::eml_get(x, "objectName") %in% names(data_objects) ||
                              EML::eml_get(x, "entityName") %in% names(data_objects), eml_objects)

    # Index objects in parallel based on names (in ascending order) for correct processing in iterations
    eml_objects <- eml_objects[order(names(eml_objects))]
    data_objects <- data_objects[order(names(data_objects))]

    if (check_attributes) mapply(qa_attributes, eml_objects, data_objects, MoreArgs = list(doc = doc))

    cat(crayon::green(paste0("\n\n.....Processing complete for package ",
                             package$resource_map, "...............")))
}


# Helper function for downloading and reading all data objects in a data package
dl_and_read_all_data <- function(mn, package, doc, read_all_data) {
    stopifnot(class(mn) %in% c("MNode", "CNode"))
    stopifnot(is.list(package), length(package) > 0)
    stopifnot(methods::is(doc, "emld"))
    stopifnot(is.logical(read_all_data))

    urls <- unique(arcticdatautils::eml_get_simple(doc$dataset, "url")) %>%
        grep("http", ., value = TRUE)

    # Check that each data object has a matching URL in the EML
    wrong_URL <- FALSE
    for (datapid in package$data) {
        n <- which(grepl(paste0(datapid, "$"), urls))
        if (length(n) != 1) {
            cat(crayon::red(paste("\nThe distribution URL for object", datapid, "is missing or incongruent in the physical section of the EML.\n")))
            wrong_URL <- TRUE
        }
    }

    if (length(urls) != length(package$data) || wrong_URL) {
        # Stop here to ensure proper ordering in the following iterations
        stop("\nAll distribution URLs for data objects must match the data PIDs to continue.")
    }

    if (read_all_data) {
        rows_to_read <- -1
    } else {
        rows_to_read <- 10
    }

    objects <- lapply(package$data, dl_and_read_data, doc, mn, rows_to_read)

    return(objects)
}
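
# A minimal usage sketch mirroring the call in qa_package(); `package` comes
# from arcticdatautils::get_package() and `doc` from EML::read_eml():
#   data_objects <- dl_and_read_all_data(mn, package, doc, read_all_data = FALSE)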


# Helper function for downloading and reading a data object
dl_and_read_data <- function(objectpid, doc, mn, rows_to_read) {
    supported_file_formats <- c("text/csv",
                                "text/tsv",
                                "text/plain",
                                "application/vnd.ms-excel",
                                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                                "application/zip",
                                "netCDF-4",
                                "netCDF-3",
                                "CF-1.4", "CF-1.3", "CF-1.2", "CF-1.1", "CF-1.0")

    if (!is.null(names(doc$dataset$dataTable)) && length(doc$dataset$dataTable) > 0) {
        doc$dataset$dataTable <- list(doc$dataset$dataTable)
    }
    if (!is.null(names(doc$dataset$otherEntity)) && length(doc$dataset$otherEntity) > 0) {
        doc$dataset$otherEntity <- list(doc$dataset$otherEntity)
    }
    if (!is.null(names(doc$dataset$spatialVector)) && length(doc$dataset$spatialVector) > 0) {
        doc$dataset$spatialVector <- list(doc$dataset$spatialVector)
    }

    if (!is.null(doc$dataset$dataTable)) {
        urls_dataTable <- unique(arcticdatautils::eml_get_simple(doc$dataset$dataTable, "url")) %>%
            grep("http", ., value = TRUE)
        n_dT <- which(grepl(paste0(objectpid, "$"), urls_dataTable))
    } else n_dT <- list()
    if (!is.null(doc$dataset$otherEntity)) {
        urls_otherEntity <- unique(arcticdatautils::eml_get_simple(doc$dataset$otherEntity, "url")) %>%
            grep("http", ., value = TRUE)
        n_oE <- which(grepl(paste0(objectpid, "$"), urls_otherEntity))
    } else n_oE <- list()
    if (!is.null(doc$dataset$spatialVector)) {
        urls_spatialVector <- unique(arcticdatautils::eml_get_simple(doc$dataset$spatialVector, "url")) %>%
            grep("http", ., value = TRUE)
        n_sV <- which(grepl(paste0(objectpid, "$"), urls_spatialVector))
    } else n_sV <- list()


    if (length(n_dT) == 1) {
        entity <- doc$dataset$dataTable[[n_dT]]
        urls <- urls_dataTable
        i <- n_dT
    } else if (length(n_oE) == 1) {
        entity <- doc$dataset$otherEntity[[n_oE]]
        urls <- urls_otherEntity
        i <- n_oE
    } else if (length(n_sV) == 1) {
        entity <- doc$dataset$spatialVector[[n_sV]]
        urls <- urls_spatialVector
        i <- n_sV
    } else {
        cat(crayon::yellow("\nData object is not tabular or not a supported format. Skipped."))
        cat(crayon::green(paste0("\n..........Object ", objectpid, " not downloaded..............................")))
        return(list(status = "SKIP"))
    }

    cat(crayon::green(paste0("\n\n..........Downloading object ", objectpid,
                             " (", entity$physical$objectName, ")...............")))

    # If object is not tabular data, skip to next object
    format <- arcticdatautils::eml_get_simple(entity, "formatName")
    if (length(format) == 0) {
        cat(crayon::red("\nData object has no given format ID in EML. Unable to check if supported format. Skipped"))
        cat(crayon::green("\n..........Object not downloaded.............................."))
        return(list(status = "SKIP"))
    } else if (!(format %in% supported_file_formats)) {
        cat(crayon::red("\nData object is not tabular or not a supported format. Skipped."))
        cat(crayon::green("\n..........Object not downloaded.............................."))
        return(list(status = "SKIP"))
    }

    if (is.null(EML::get_attributes(entity$attributeList)$attributes) && length(slot(entity$attributeList, 'references')) == 0) {
        cat(crayon::red(paste0("\nEmpty attribute table for data object. Skipped.")))
        cat(crayon::green("\n..........Object not downloaded.............................."))
        return(list(status = "SKIP"))
    }

    # If package is public, read directly from the file; otherwise, use DataONE API
    sysmeta <- dataone::getSystemMetadata(mn, objectpid)
    isPublic <- datapack::hasAccessRule(sysmeta, "public", "read")

    tryCatch({
        if (isPublic) {
            if (format == "text/csv") {
                data <- utils::read.csv(urls[i], nrows = rows_to_read, check.names = FALSE, stringsAsFactors = FALSE)
            } else if (format == "text/tsv") {
                data <- utils::read.delim(urls[i], nrows = rows_to_read)
            } else if (format == "text/plain") {
                data <- utils::read.table(urls[i], nrows = rows_to_read)
            } else if (format == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || format == "application/vnd.ms-excel") {
                tmp <- tempfile()
                utils::download.file(url = urls[i], destfile = tmp, mode = "wb", quiet = TRUE)
                data <- readxl::read_excel(tmp, n_max = if (rows_to_read == -1) Inf else rows_to_read)
                unlink(tmp)
            } else if (format == "application/zip") {
                # Many formats can exist within a .zip file; skip if not spatial data
                tmp <- tempfile()
                utils::download.file(url = urls[i], destfile = tmp, quiet = TRUE)
                tmp2 <- tempfile()
                utils::unzip(tmp, exdir = tmp2)
                t <- list.files(tmp2, full.names = TRUE, recursive = TRUE)
                if (any(grep("*\\.shp", t))) {
                    cat(crayon::yellow("\nNote: Shapefiles have attribute name limits of 10 characters."))
                    data <- suppressWarnings(sf::read_sf(t[grep("*\\.shp", t)]) %>% sf::st_set_geometry(NULL))
                } else if (any(grep("*\\.gdb", t))) {
                    data <- suppressWarnings(sf::read_sf(list.dirs(tmp2)[2]) %>% sf::st_set_geometry(NULL))
                } else {
                    cat(crayon::red("\nSpatial data not present within .zip file. Skipped."))
                    cat(crayon::green("\n..........Object not downloaded.............................."))
                    unlink(c(tmp, tmp2), recursive = TRUE)
                    return(list(status = "SKIP"))
                }
                unlink(c(tmp, tmp2), recursive = TRUE)
            } else if (format == "netCDF-4" || format == "netCDF-3" || format == "CF-1.4" || format == "CF-1.3" ||
                       format == "CF-1.2" || format == "CF-1.1" || format == "CF-1.0") {
                tmp <- tempfile()
                utils::download.file(url = urls[i], destfile = tmp, mode = "wb", quiet = TRUE)
                nc <- ncdf4::nc_open(tmp)
                data <- netcdf_to_dataframe(nc)
                unlink(tmp)
                rm(nc) # clean up now because many netCDF files are large
                data
            }
        } else {
            if (format == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || format == "application/vnd.ms-excel") {
                cat(crayon::red("\nThis function uses the DataONE API to read private data objects and currently cannot read .xls or .xlsx files.\nSkipped. Check attributes manually."))
                cat(crayon::green("\n..........Object not downloaded.............................."))
                return(list(status = "SKIP"))
            } else if (format == "netCDF-4" || format == "netCDF-3" || format == "CF-1.4" || format == "CF-1.3" ||
                       format == "CF-1.2" || format == "CF-1.1" || format == "CF-1.0") {
                tmp <- tempfile()
                writeBin(dataone::getObject(mn, objectpid), tmp)
                nc <- ncdf4::nc_open(tmp)
                data <- netcdf_to_dataframe(nc)
                unlink(tmp)
                rm(nc) # clean up now because many netCDF files are large
                data
            } else {
                data <- utils::read.csv(textConnection(rawToChar(dataone::getObject(mn, objectpid))),
                                        nrows = rows_to_read, check.names = FALSE, stringsAsFactors = FALSE)
            }
        }

        cat(crayon::green("\n..........Download complete.............................."))
        return(data)
    },
    error = function(e) {
        cat(crayon::red(paste0("\nFailed to read file at ", urls[i], ". Skipped.")))
        cat(crayon::green("\n..........Object not downloaded.............................."))
        return(list(status = "SKIP"))
    })
}


# Helper function for converting 2D data from a netCDF to a data.frame object
netcdf_to_dataframe <- function(nc) {
    att_names <- names(nc$var)
    dims <- nc$dim
    dim_names <- c()
    for (i in seq_along(dims)) {
        dim_names[i] <- dims[[i]]$name
    }

    var_names <- c(att_names, dim_names)
    # remove duplicates
    dup_indices <- which(duplicated(tolower(var_names)))
    if (length(dup_indices) > 0) {
        var_names <- var_names[-dup_indices]
    }

    data <- lapply(var_names, function(x) ncdf4::ncvar_get(nc, x))
    max_length <- max(unlist(lapply(data, function(x) length(x))))

    results <- data.frame(matrix(ncol = length(data), nrow = max_length))
    names(results) <- var_names
    for (i in seq_along(results)) {
        results[ , i] <- rep_len(data[[i]], length.out = max_length)
    }

    return(results)
}
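
# Usage sketch (assuming a hypothetical local file "example.nc"):
#   nc <- ncdf4::nc_open("example.nc")
#   df <- netcdf_to_dataframe(nc)
#   ncdf4::nc_close(nc)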


#' Check congruence of data and metadata attributes for a tabular data object
#'
#' This function checks the congruence of data and metadata attributes
#' for a tabular data object. Supported objects include `dataTable`, `otherEntity`,
#' and `spatialVector` entities. It can be used on its own but is also
#' called by [qa_package()] to check all tabular data objects in a data package.
#'
#' This function checks the following:
#' * Names: Check that column names in attributes match column names in data frame. Possible conditions to check for:
#'     * attributeList does not exist for data frame
#'     * Some of the attributes that exist in the data do not exist in the attributeList
#'     * Some of the attributes that exist in the attributeList do not exist in the data
#'     * Typos in attribute or column names resulting in nonmatches
#' * Domains: Check that attribute types in EML match attribute types in data frame. Possible conditions to check for:
#'     * nominal, ordinal, integer, ratio, dateTime
#'     * If domain is enumerated domain, enumerated values in the data are accounted for in the enumerated definition
#'     * If domain is enumerated domain, enumerated values in the enumerated definition are all represented in the data
#'     * Type of data does not match attribute type
#' * Values: Check that values in data are reasonable. Possible conditions to check for:
#'     * Accidental characters in the data (e.g., one character in a column of integers)
#'     * If missing values are present, missing value codes are also present
#'
#' @param entity (emld) An EML `dataTable`, `otherEntity`, or `spatialVector` associated with the data object.
#' @param data (data.frame) A data frame of the data object.
#' @param doc (emld) The entire EML object. This is necessary if attributes with references are being checked.
#'
#' @return `NULL`
#'
#' @import arcticdatautils
#' @import EML
#' @importFrom crayon green red yellow
#' @importFrom lubridate parse_date_time
#' @importFrom methods is slot
#' @importFrom stats na.omit
#' @importFrom stringr str_detect str_split
#' @importFrom utils capture.output head
#'
#' @export
#'
#' @seealso [qa_package()]
#'
#' @examples
#' \dontrun{
#' # Checking a .csv file
#' dataTable <- doc$dataset$dataTable[[1]]
#' data <- readr::read_csv("https://cn.dataone.org/cn/v2/resolve/urn:uuid:...")
#'
#' qa_attributes(dataTable, data)
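#'
#' # If the entity's attributeList is stored as a reference to another entity's
#' # attributeList, pass the full EML document so the reference can be resolved
#' qa_attributes(dataTable, data, doc = doc)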
#' }
qa_attributes <- function(entity, data, doc = NULL) {
    stopifnot(is.data.frame(data))
    if (!is.null(doc) && !methods::is(doc, "emld")) {
        stop("Input should be of class 'emld'.")
    }

    objectpid <- stringr::str_split(entity$physical$distribution$online$url$url, "(?=urn.)", simplify = TRUE)[[2]]

    cat(crayon::green(paste0("\n\n..........Processing object ", objectpid,
                             " (", entity$physical$objectName, ")...............")))

    entity_list <- doc$dataset[names(doc$dataset) %in% c("dataTable", "otherEntity", "spatialVector")]
    names(entity_list) <- 'entity'

    tryCatch({
        suppressWarnings(attributeTable <- EML::get_attributes(entity$attributeList))
        # Check for references
        if (nrow(attributeTable$attributes) == 0) {
            ref_index <- match_reference_to_attributeList(doc, entity)
            if (length(ref_index) > 0) {
                entity2 <- entity_list$entity[[ref_index]]
                attributeTable <- EML::get_attributes(entity2$attributeList)
            }
        }
        attributeNames <- attributeTable$attributes$attributeName

        # Check if attributes are present
        if (is.null(attributeNames)) {
            cat(crayon::red(paste("\nEmpty attribute table for object at", entity$physical$distribution$online$url)))
        }

        # Check for duplicated attributes based on names
        if (any(duplicated(attributeNames))) {
            cat(crayon::red(paste("\nThere are duplicated attribute names in the EML.")))
        }

        header <- as.numeric(entity$physical$dataFormat$textFormat$numHeaderLines)

        if (length(header) > 0 && !is.na(header) && header > 1) {
            names(data) <- NULL
            names(data) <- data[(header - 1), ]
        }

        data_cols <- colnames(data)

        # Check that attribute names match column names
        allequal <- isTRUE(all.equal(data_cols, attributeNames))

        if (!allequal) {
            intersection <- intersect(attributeNames, data_cols)
            nonmatcheml <- attributeNames[!attributeNames %in% intersection]
            nonmatchdata <- data_cols[!data_cols %in% intersection]

            # EML has values that data does not have
            if (length(nonmatcheml) > 0) {
                cat(crayon::red(paste0("\nThe EML includes attributes '", toString(nonmatcheml, sep = ", "), "' that are not present in the data.")))
                cat(crayon::yellow("\nContinuing attribute and data matching without mismatched attributes - fix issues and re-run after first round completion."))
            }

            # Data has values that EML does not have
            if (length(nonmatchdata) > 0) {
                cat(crayon::red(paste0("\nThe data includes attributes '", toString(nonmatchdata, sep = ", "), "' that are not present in the EML.")))
                cat(crayon::yellow("\nContinuing attribute and data matching without mismatched attributes - fix issues and re-run after first round completion."))
            }

            # Values match but are not ordered correctly (allequal is already FALSE here)
            if (length(nonmatcheml) == 0 && length(nonmatchdata) == 0) {
                cat(crayon::yellow("\nAttribute names match column names but are incorrectly ordered."))
            }

            data <- data[ , which(colnames(data) %in% intersection)]
            attributeTable$attributes <- attributeTable$attributes[which(attributeTable$attributes$attributeName %in% intersection), ]
            attributeTable$attributes <- attributeTable$attributes[order(match(attributeTable$attributes$attributeName, colnames(data))), ]
        }

        # Check that type of column matches type of data based on acceptable DataONE formats
        for (i in seq_along(data)) {
            matchingAtt <- attributeTable$attributes[i, ]
            attClass <- class(data[ , i])
            # If matchingAtt has a dateTime domain, coerce the column based on the date/time format
            if (matchingAtt$measurementScale == "dateTime") {
                attClass <- class(suppressWarnings(lubridate::parse_date_time(data[ , i], orders = c("ymd", "HMS", "ymd HMS", "y", "m", "d", "ym", "md", "m/d/y",
                                                                                                     "d/m/y", "ymd HM", "yq",  "j", "H", "M", "S", "MS", "HM", "I",
                                                                                                     "a", "A", "U", "w", "W"))))
            }

            if (attClass == "numeric" || attClass == "integer" || attClass == "double") {
                if (matchingAtt$measurementScale != "ratio" && matchingAtt$measurementScale != "interval" && matchingAtt$measurementScale != "dateTime") {
                    cat(crayon::yellow(paste0("\nMismatch in attribute type for the attribute '", matchingAtt$attributeName,
                                              "'. Type of data is ", attClass, " which should probably have interval or ratio measurementScale in EML, not ",
                                              matchingAtt$measurementScale, ".")))
                }
            } else if (attClass == "character" || attClass == "logical") {
                if (matchingAtt$measurementScale != "nominal" && matchingAtt$measurementScale != "ordinal") {
                    cat(crayon::yellow(paste0("\nMismatch in attribute type for the attribute '", matchingAtt$attributeName,
                                              "'. Type of data is ", attClass, " which should probably have nominal or ordinal measurementScale in EML, not ",
                                              matchingAtt$measurementScale, ".")))
                }
            } else if (any(attClass %in% c("POSIXct", "POSIXt", "Date", "Period"))) {
                if (matchingAtt$measurementScale != "dateTime") {
                    cat(crayon::yellow(paste0("\nMismatch in attribute type for the attribute '", matchingAtt$attributeName,
                                              "'. Type of data is ", attClass, " which should probably have dateTime measurementScale in EML, not ",
                                              matchingAtt$measurementScale, ".")))
                }
            }
        }

        # Check that enumerated domains match values in data
        if (length(attributeTable$factors) > 0) {
            for (i in seq_along(unique(attributeTable$factors$attributeName))) {
                emlAttName <- unique(attributeTable$factors$attributeName)[i]
                emlUniqueValues <- attributeTable$factors[attributeTable$factors$attributeName == emlAttName, "code"]

                dataUniqueValues <- unique(stats::na.omit(data[[which(colnames(data) == emlAttName)]])) # omit NAs in unique values

                intersection <- intersect(dataUniqueValues, emlUniqueValues)
                nonmatcheml <- emlUniqueValues[!emlUniqueValues %in% intersection]
                nonmatchdata <- dataUniqueValues[!dataUniqueValues %in% intersection]

                if (length(nonmatcheml) > 0) {
                    cat(crayon::yellow(paste0("\nThe EML contains the following enumerated domain values for the attribute '",
                                              as.character(emlAttName), "' that do not appear in the data: ", toString(nonmatcheml, sep = ", "))))
                }

                if (length(nonmatchdata) > 0) {
                    cat(crayon::yellow(paste0("\nThe data contains the following enumerated domain values for the attribute '",
                                              as.character(emlAttName), "' that do not appear in the EML: ", toString(nonmatchdata, sep = ", "))))
                }
            }
        }

        # If there are any missing values in the data, check that there is an associated missing value code in the EML
        for (i in which(colSums(is.na(data)) > 0)) { # only checks for NA values but others like -99 or -999 could be present
            attribute <- attributeTable$attributes[i, ]
            if (is.null(attribute$missingValueCode)) {
                cat(crayon::red(paste0("\nThe attribute '", attribute$attributeName, "' contains missing values but does not have a missing value code.")))
            }
        }
    },
    error = function(e) {
        cat(crayon::red("\nError. Processing for object stopped. "))
        cat(crayon::red("Here's the original error message: "))
        cat(crayon::red(conditionMessage(e)))
    }
    )

    cat(crayon::green("\n..........Processing complete.............................."))
}


# Helper function for matching a reference to an attributeList
# Returns the index of the match
match_reference_to_attributeList <- function(doc, entity) {
    # Get list of 'dataTable', 'otherEntity', etc.
    entity_list <- doc$dataset[names(doc$dataset) %in% c("dataTable", "otherEntity", "spatialVector")]
    names(entity_list) <- 'entity'
    # Get the ref we want to match
    ref <- eml_get_simple(entity, "references")
    # Get all of the references present
    att_lists <- eml_get(entity_list, "attributeList")
    # Get the index of the reference we want
    ids <- lapply(att_lists, eml_get_simple, "id")
    ids <- ids[!names(ids) == "@context"]
    ids <- lapply(ids, function(x) if (length(x) > 1) x[x != "@id"] else x)
    suppressWarnings(
        index <- which(str_detect(ids, paste0('^', ref, '$'))))

    return(index)
}
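
# Usage sketch mirroring qa_attributes(): resolve a referenced attributeList
# (`entity_list` is built there from doc$dataset, as shown above):
#   ref_index <- match_reference_to_attributeList(doc, entity)
#   if (length(ref_index) > 0) {
#       attributes <- EML::get_attributes(entity_list$entity[[ref_index]]$attributeList)
#   }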


#' Check the ORCIDs of creators in a given EML
#'
#' This function is called by \code{\link{qa_package}}.
#' See \code{\link{qa_package}} documentation for more details.
#'
#' @param doc (emld) Package metadata.
#'
#' @return creator_ORCIDs (character) Returns \code{character(0)} if any tests fail.
#'
#' @noRd
qa_creator_ORCIDs <- function(doc) {
    # Check creators
    creators <- doc$dataset$creator

    if (!is.null(names(creators))) {
        creators <- list(doc$dataset$creator)
    }

    creator_ORCIDs <- unlist(arcticdatautils::eml_get_simple(creators, "userId")) %>%
        grep("orcid", ., value = TRUE)

    creator_ORCIDs <- creator_ORCIDs[which(names(creator_ORCIDs) == "userId")]

    isORCID <- grepl("http[s]?:\\/\\/orcid\\.org\\/[[:alnum:]]{4}-[[:alnum:]]{4}-[[:alnum:]]{4}-[[:alnum:]]{4}", creator_ORCIDs)
    creator_ORCIDs <- sub("^https://", "http://", creator_ORCIDs)

    if (length(isORCID) != length(creators) || !all(isORCID)) {
        cat(crayon::red("\nEach creator needs to have a proper ORCID."))
        return(character(0))
    } else {
        return(creator_ORCIDs)
    }
}


#' Check rights and access for creators in sysmeta
#'
#' This function is called by \code{\link{qa_package}}.
#' See \code{\link{qa_package}} documentation for more details.
#'
#' @param sysmeta (sysmeta)  Sysmeta of a given object.
#' @param creator_ORCIDs (character) ORCIDs of creators. Result of \code{\link{qa_creator_ORCIDs}}.
#'
#' @noRd
qa_access <- function(sysmeta, creator_ORCIDs) {
    # Check rightsHolder
    if (!(sysmeta@rightsHolder %in% creator_ORCIDs)) {
        cat(crayon::yellow("\nThe rightsHolder for", sysmeta@identifier, "is not set to one of the creators."))
    }

    # Check creator access
    for (creator in creator_ORCIDs) {
        creator_read <- datapack::hasAccessRule(sysmeta, creator, "read")
        creator_write <- datapack::hasAccessRule(sysmeta, creator, "write")
        creator_changePermission <- datapack::hasAccessRule(sysmeta, creator, "changePermission")
        creator_rightsHolder <- sysmeta@rightsHolder %in% creator
        access <- c(creator_read, creator_write, creator_changePermission)

        if (!all(access) && !creator_rightsHolder) {
            cat(crayon::yellow("\nFull access for", sysmeta@identifier, "is not set for creator with ORCID", creator))
        }
    }
}
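
# Usage sketch mirroring the calls in qa_package() (assumes a DataONE token is set):
#   sysmeta <- dataone::getSystemMetadata(mn, package$metadata)
#   qa_access(sysmeta, creator_ORCIDs)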