R/create_otherEntity.R

Defines functions create_otherEntity

Documented in create_otherEntity

#' @title create EML entity of type otherEntity
#'
#' @description \code{create_otherEntity} generates a EML entity of type
#' otherEntity.
#'
#' @details A otherEntity entity is created from a single file (e.g.,
#' desert_fertilization_sampling_sites.kml) or a directory. The resulting
#' entity is renamed with the package number + base file name + md5sum + file
#' extension. File extension is always .zip if the otherEntity is being created
#' by zipping a directory.
#'
#' @note \code{create_otherEntity} will look for a package identifier and
#' baseURL in config.yaml. `baseURL` must be provided. `package identifier`
#' must be provided if project naming is enabled.
#'
#' @note The target data file or directory can be located anywhere on a local
#' computer but a renamed file or directory will be written to the current
#' working directory.
#'
#' @param target_file_or_directory
#'  (charater) The quoted name and path of the data file or directory.
#' @param description
#'  (character) A description of the data entity.
#' @param overwrite
#'  (logial) If creating otherEntity by zipping a directory, this is a logical
#'  indicating whether to overwrite an existing zip file that has the same name
#'  in the location as the temporary zip object to be created.
#' @param project_naming
#'  (logical) Indicating if the file or directory should be renamed per the
#'  style used by the CAP LTER (default) with the package number + base file
#'  name + file extension. The passed file or directory name will be used if
#'  this parameter is set to FALSE.
#' @param additional_information
#'  (character) Additional information about the data object. Should be quoted,
#'  and accepts (but not required) markdown formatting.
#' @param entity_value_description
#'  (character) For raster data, a desciption of the raster value.
#'
#' @import EML
#' @importFrom dplyr select pull mutate select_if
#' @importFrom readr read_csv
#' @importFrom tools md5sum file_ext
#' @importFrom stringr str_extract
#' @importFrom tools file_ext
#' @importFrom utils file_test
#' @importFrom yaml yaml.load_file
#' @importFrom purrr map_chr
#'
#' @return EML entity of type otherEntity is returned. Additionally, the data
#' file or directory is renamed with the package number + base file name + file
#' extension if project naming is enabled.
#'
#' @examples
#' \dontrun{
#'
#'  # using default parameters
#'
#'  desert_fertilization_sites <- capeml::create_otherEntity(
#'    target_file_or_directory = "path-to-file/sampling_sites.Rmd",
#'    description              = "long-term study site documentation"
#'  )
#'
#'  # set overwrite = TRUE - has the effect of overwriting an existing zipped
#'  # directory of the same name if the otherEntity is being generated by
#'  # zipping a directory.
#'
#'  pass_codebook_2011 <- capeml::create_otherEntity(
#'    target_file_or_directory = "PASS-2011-Codebook-Feb2016rev.pdf",
#'    description              = "PASS 2011 survey codebook",
#'    overwrite                = TRUE
#'  )
#'
#'  # example without project naming:
#'
#'  pass_codebook_2011 <- capeml::create_otherEntity(
#'    target_file_or_directory = "path-to-directory/Desktop/max_temperature",
#'    description              = "rasters of max temperature years 2000-2016",
#'    project_naming           = FALSE
#'  )
#'
#' }
#'
#' @export
#'
create_otherEntity <- function(
  target_file_or_directory,
  description,
  overwrite              = FALSE,
  project_naming         = TRUE,
  additional_information = NULL,
  entity_value_description
  ) {

  # required parameters --------------------------------------------------------

  # description

  if (missing("description")) {

    stop("please provide a description for this object")

  }

  # target file or directly

  if (missing("target_file_or_directory")) {

    stop("specify the name of the file or directory")

  }


  # defaults -------------------------------------------------------------------

  is_directory <- FALSE
  is_shape     <- FALSE


  # zip if targetfile is a directory -------------------------------------------

  # create zip of directory if target_file_or_directory is in fact a directory
  if (file_test(op = "-d", x = target_file_or_directory)) {

    # flag that target_file_or_directory is a directory
    is_directory <- TRUE

    # new object name: base name + zip extension
    zippedObject <- paste0(basename(target_file_or_directory), ".zip")

    # need full path to zip a directory
    targetFileFullPath <- path.expand(target_file_or_directory)

    # check if we are zipping a shapefile
    if ("shp" %in% purrr::map_chr(.x = list.files(targetFileFullPath), ~ tools::file_ext(.x))) {

      is_shape <- TRUE

    }

    # stop if zipping the directory will overwrite an existing object without
    # explicit overwrite - note that this is checking the existence of the
    # temporary object (e.g., dirname.zip), not the ultimate object (e.g.,
    # packagenumber_dirname_md5hash.zip)
    if (file.exists(zippedObject) && overwrite == FALSE) {

      stop("zip with that name at that location already exists, set 'overwrite = TRUE'")

    }

    # zip the target directory
    system(
      paste0(
        "zip -jXr ",
        shQuote(zippedObject, type = "sh"),
        " ",
        shQuote(targetFileFullPath, type = "sh")
      )
    )

    target_file_or_directory <- zippedObject

  }


  # object size, type, and hash ------------------------------------------------

  # determine the file extension
  fileExtension <- tools::file_ext(target_file_or_directory)

  # set authentication (md5)
  fileAuthentication                <- EML::eml$authentication(method = "MD5")
  fileAuthentication$authentication <- tools::md5sum(target_file_or_directory)

  # set file size
  fileSize      <- EML::eml$size(unit = "byte")
  fileSize$size <- deparse(file.size(target_file_or_directory))

  # set file format
  if (grepl("pdf", tools::file_ext(target_file_or_directory), ignore.case = TRUE)) {

    fileDataFormat <- EML::eml$dataFormat(
      externallyDefinedFormat = EML::eml$externallyDefinedFormat(formatName = "Portable Document Format")
    )

  } else if (is_shape == TRUE) {

    fileDataFormat <- EML::eml$dataFormat(
      externallyDefinedFormat = EML::eml$externallyDefinedFormat(formatName = "Esri Shapefile (zipped)")
    )

  } else {

    fileDataFormat <- EML::eml$dataFormat(
      externallyDefinedFormat = EML::eml$externallyDefinedFormat(formatName = fileExtension)
    )

  }

  targetFileBaseName <- basename(target_file_or_directory)
  directoryName      <- dirname(target_file_or_directory)
  directoryNameFull  <- sub("/$", "", path.expand(directoryName))
  pathToFile         <- path.expand(target_file_or_directory)


  # retrieve dataset details from config.yaml

  configurations <- read_package_configuration()


  # project naming -------------------------------------------------------------

  if (project_naming == TRUE) {

    identifier <- configurations$identifier

    targetFileBaseName <- paste0(
      identifier, "_",
      stringr::str_extract(targetFileBaseName, "^[^\\.]*"),
      ".",
      fileExtension
    )

    file.copy(
      from = target_file_or_directory,
      to   = paste0(directoryNameFull, "/", targetFileBaseName)
    )

  }


  # construct physical ---------------------------------------------------------

  fileURL <- configurations$fileURL

  fileDistribution <- EML::eml$distribution(
    EML::eml$online(url = paste0(fileURL, targetFileBaseName))
  )

  filePhysical <- EML::eml$physical(
    objectName     = targetFileBaseName,
    authentication = fileAuthentication,
    size           = fileSize,
    dataFormat     = fileDataFormat,
    distribution   = fileDistribution
  )


  # create otherEntity ---------------------------------------------------------

  newOE <- EML::eml$otherEntity(
    entityName        = targetFileBaseName,
    entityDescription = description,
    physical          = filePhysical,
    entityType        = fileExtension,
    id                = targetFileBaseName
  )


  # attributes -----------------------------------------------------------------

  # helper function to remove missing columns
  not_all_na <- function(x) {
    !all(is.na(x))
  }

  if (file.exists(paste0(tools::file_path_sans_ext(target_file_or_directory), "_attrs.yaml"))) {

    attrs <- yaml::yaml.load_file(paste0(tools::file_path_sans_ext(target_file_or_directory), "_attrs.yaml"))
    attrs <- yaml::yaml.load(attrs)
    attrs <- tibble::enframe(attrs) |>
    tidyr::unnest_wider(value) |>
    dplyr::select(-one_of("name"))

    # column classes to vector (req'd by set_attributes)
    classes <- attrs |>
    dplyr::pull(columnClasses)

    # - copy attributeDefinition to defintion as appropriate;
    # - remove col classes from attrs (req'd by set_attributes);
    # - remove empty columns (especially max & min, which
    #   can throw an error for data without any numeric cols)
    attrs <- attrs |>
    dplyr::mutate(
      definition = case_when(
        grepl("character", columnClasses) & ((is.na(definition) | definition == "")) ~ attributeDefinition,
        TRUE ~ definition
      )
      ) |>
    dplyr::select(-columnClasses) |>
    dplyr::select_if(not_all_na)

    has_attributes <- TRUE

  } else {

    has_attributes <- FALSE

  }


  # factors --------------------------------------------------------------------

  # use factors if exist

  if (file.exists(paste0(tools::file_path_sans_ext(target_file_or_directory), "_factors.yaml"))) {

    entity_factors <- yaml.load_file(paste0(tools::file_path_sans_ext(target_file_or_directory), "_factors.yaml")) |>
      yaml::yaml.load() |>
      tibble::enframe() |>
      tidyr::unnest_wider(value) |>
      tidyr::unnest_wider(attribute) |>
      tidyr::unnest_longer(levels) |>
      tidyr::unnest_wider(levels) |>
      dplyr::select(-one_of("name"))

    has_factors <- TRUE

  } else {

    has_factors <- FALSE

  }


  # compile components for attributeList ---------------------------------------

  # condition: factors only

  if (has_attributes == FALSE && has_factors == TRUE) {

    if (missing(entity_value_description)) {

      entity_value_description <- "METADATA_NOT_PROVIDED"

    }

    attrs_factor_only <- data.frame(
      attributeName       = entity_factors[["attributeName"]][[1]],
      attributeDefinition = entity_value_description
    )

    attr_list <- EML::set_attributes(
      attributes  = attrs_factor_only,
      factors     = entity_factors,
      col_classes = "factor"
    )

    has_factor_only <- TRUE

    message("added: ", targetFileBaseName, " factors")

  } else {

    has_factor_only <- FALSE

  }

  # condition: attributes and factors

  if (has_attributes == TRUE && has_factors == TRUE) {

    attr_list <- EML::set_attributes(
      attributes  = attrs,
      factors     = entity_factors,
      col_classes = classes
    )

    message("added: ", targetFileBaseName, " attributes")
    message("added: ", targetFileBaseName, " factors")

  }

  # condition: attributes only

  if (has_attributes == TRUE && has_factors == FALSE) {

    attr_list <- EML::set_attributes(
      attributes  = attrs,
      col_classes = classes
    )

    message("added: ", targetFileBaseName, " attributes")

  }

  # add attributeList

  if (has_attributes == TRUE | has_factor_only == TRUE) {

    newOE$attributeList <- attr_list

  }


  # additional information -----------------------------------------------------

  if (!is.null(additional_information)) {

    newOE$additionalInfo <- additional_information

  }


  # remove temporary objects ---------------------------------------------------

  # remove temporary zip if otherEntity is created by zipping a directory

  if (is_directory == TRUE && file.exists(zippedObject)) {

    file.remove(zippedObject)

  }


  # return other entity object -------------------------------------------------

  message("created otherEntity: ", targetFileBaseName, " sans attributes")
  return(newOE)

}
CAPLTER/capeml documentation built on Oct. 1, 2024, 7:38 a.m.