R/as.traitdata.R

Defines functions print.traitdata as.traitdata

Documented in as.traitdata

#' Standardize format of traitdata
#'
#' Turns wide-table formats (species-traits matrix and occurrence table) into
#' long-table format. As input, the function requires information about which
#' columns contain traits, given as a list of trait-names, and which column
#' contains the taxon name. For tables containing repeated measurements of
#' traits within the same taxon, an occurrenceID should be given or will be
#' created.
#'
#' @param x data.frame object, containing at least a column of taxa, and one or
#'   more columns of trait measurements.
#' @param traits a vector of column names containing traits.
#' @param taxa the name of the column containing taxon names.
#' @param occurrences either a column name containing identifiers for each
#'   individual specimen on which several traits were measured, i.e. an
#'   occurrence of this taxon, or a vector of occurrence identifiers which must
#'   be of the same length as the number of rows of the table. See 'Details'.
#' @param measurements either a column name containing identifiers for each
#'   individual measurement, or a vector of measurement identifiers. This
#'   applies, if single trait measurements span across multiple columns of data,
#'   e.g. multivariate traits like quantitative measures of chemical compounds,
#'   wavelengths or x-y-z coordinates. In most cases, a measurementID will link
#'   the data across rows in the longtable format. Make sure that the traitnames
#'   given reflect the different dimensions of the trait measurement. If
#'   `measurement` remains blank, sequential identifiers will be auto-generated
#'   for each measured value.
#' @param units a single character string or named vector giving the units that
#'   apply to the traits. If only one unit type is given, it will be applied to
#'   all traits.
#' @param datasetID a unique name for this dataset (optional). Will be prepended
#'   to the occurrence ID and measurement ID.
#' @param keep a vector or named vector containing the names of the input
#'   columns to be kept in the output. Vector names will be used to rename the
#'   columns. It is recommended to use accepted column names of the traitdata
#'   standard for renaming!
#' @param drop a vector acting as the inverse of `keep`. All columns listed will
#'   be removed from the output dataset.
#' @param na.rm logical defaults to `TRUE`. If `FALSE`, all measured Values
#'   containing NA will be kept in the output table. This is not reccomended for
#'   most data.
#' @param metadata a list of class metadata, as created by function
#'   `as.metadata()`. Metadata will be added as attributes to the data table.
#'   Possible parameters to the function call are: `rightsHolder`,
#'   `bibliographicCitation`, `license`, `author`, `datasetID`, `datasetName`,
#'   `version`. (see 'Details')
#' @param id.vars a vector of column names to return. Autogenerated from input
#'   column names and 'keep' and 'drop'.
#' @param thesaurus an object of class 'thesaurus' as created by function
#'   `as.thesaurus()`. If provided, this will superimpose trait names provided
#'   in argument `traits`. The thesaurus will be appended as an attribute and
#'   can be revisited by calling `attributes(x)$thesaurus`.
#' @param longtable logical, defaults to `TRUE`. If `FALSE`, data will not be
#'   converted into lontable format, but remain in widetable format as provided.
#'   Note that any columns not indicated in arguments `traits`, `keep`, `units`,
#'   `taxa`, `occurrences` will be dropped from the output.
#' @param conformsTo version of the Ecological Trait-data Standard to which the
#'   data conform. Default procedures return data conform to v0.10. If
#'   `conformsTo = "v0.9"`, data output will be converted to Ecological
#'   Trait-data Standard v0.9.
#' @param ... other arguments, passed on to print function.
#'
#' @details If `occurrences` is left blank, the script will check for the
#'   structure of the input table. If several entries are given for the same
#'   taxon, it assumes that input is an occurrence table, i.e. with multiple
#'   observations of a single taxon,  and assigns identifiers.
#'
#'   Metadata will be stored as attributes to the data frame and can be accessed
#'   via `attributes()`. It is not necessary but highly recommended to provide
#'   metadata when working with multiple trait data files. When appending
#'   datasets using `rbind()`, the metadata information will be added as
#'   additional columns and dataset attribution will be listed in attributes.
#'
#' @return An object of class 'traitdata'.
#' 
#' @export
#' @importFrom reshape2 melt
#'
#' @examples
#'
#' \dontrun{
#' # species-trait matrix:
#'
#' pulldata("carabids")
#'
#' dataset1 <- as.traitdata(carabids,
#'   taxa = "name_correct",
#'   traits = c("body_length", "antenna_length", "metafemur_length"),
#'   units = "mm",
#'   keep = c(basisOfRecordDecription = "source_measurement", measurementRemark = "note")
#'   )
#'
#' # occurrence table:
#'
#' pulldata("heteroptera_raw")
#'
#' dataset2 <- as.traitdata(heteroptera_raw,
#'   taxa = "SpeciesID",
#'   traits = c("Body_length", "Body_width", "Body_height", "Thorax_length",
#'     "Thorax_width", "Head_width", "Eye_width", "Antenna_Seg1", "Antenna_Seg2",
#'     "Antenna_Seg3", "Antenna_Seg4", "Antenna_Seg5", "Front.Tibia_length",
#'     "Mid.Tibia_length", "Hind.Tibia_length", "Front.Femur_length",
#'     "Hind.Femur_length", "Front.Femur_width", "Hind.Femur_width",
#'     "Rostrum_length", "Rostrum_width", "Wing_length", "Wing_widt"),
#'   units = "mm",
#'   keep = c(sex = "Sex", references = "Source", lifestage = "Wing_development"),
#'   metadata = as.metadata(
#'     author = "Gossner MM, Simons NK, Höck L and Weisser WW",
#'     datasetName = "Morphometric traits Heteroptera",
#'     bibliographicCitation = attributes(heteroptera_raw)$citeAs,
#'     license = "http://creativecommons.org/publicdomain/zero/1.0/"
#'     )
#' )
#' }
#' 

as.traitdata <- function(x, 
                         traits = attributes(x)$traits, # name of column or vector of trait names
                         taxa = attributes(x)$taxa, # name of column or vector of species/taxon names
                         occurrences = attributes(x)$occurrences,
                         datasetID = attributes(x)$datasetID,
                         measurements = attributes(x)$measurements,
                         units = attributes(x)$units,
                         keep = attributes(x)$keep,
                         drop = attributes(x)$drop, 
                         na.rm = TRUE,
                         id.vars = names(x)[names(x) %in% keep & !names(x) %in% drop],
                         thesaurus = attributes(x)$thesaurus,
                         metadata = attributes(x)$metadata,
                         longtable = TRUE,
                         conformsTo = "v0.10",
                         ...
) {
  
  input_name <- deparse(substitute(x))
  
  if(is.null(thesaurus) && is.null(traits)) stop("Not able to identify structure of data! Please provide column(s) with trait values in argument 'traits' or as an object of class 'thesaurus' in argument 'thesaurus'!")
  
  if(!is.null(thesaurus) && "thesaurus" %in% class(thesaurus) && is.null(traits)) traits <- names(thesaurus)
  
  # rename taxon column into 'verbatimScientificName'
  if(length(taxa) == 1 && !is.null(taxa)) colnames(x)[colnames(x) == taxa] <- "verbatimScientificName"

  
  if(is.null(occurrences) && length(x$verbatimScientificName) != length(unique(x$verbatimScientificName)) ) {
    x$verbatimScientificName <- as_factor_clocale(x$verbatimScientificName)
    occurrences <- seq_along(x$verbatimScientificName)
    x$occurrenceID <- paste(datasetID, occurrences, sep = "")
    message("it seems you are providing repeated measures of traits on multiple specimens of the same species (i.e. an occurrence table)! Sequential identifiers for the occuences will be added. If your dataset contains user-defined occurrenceIDs you may specify the column name in parameter 'occurrences'. ")
  }

  if(is.null(occurrences) && length(x$verbatimScientificName) == length(unique(x$verbatimScientificName)) ) {
    x$verbatimScientificName <- as_factor_clocale(x$verbatimScientificName)
    message("Input is taken to be a species -- trait matrix. If this is not the case, please provide parameters!")
  }
  
  # if occurrences has a single character string, take this as column name for occurrence IDs 
  if(!is.null(occurrences) && length(occurrences) == 1) { 
      x$verbatimScientificName <- as_factor_clocale(x$verbatimScientificName)
      colnames(x)[colnames(x) == occurrences] <- "occurrenceID" 
      x$occurrenceID <- paste(datasetID, x$occurrenceID, sep = "")
      message("Input is taken to be an occurrence table/an observation -- trait matrix \n(i.e. with individual specimens per row and multiple trait measurements in columns). \nIf this is not the case, please provide parameters! ")
      
  } 
  
  # check for occurrence table format & add occurrence ID 
  #if(!is.null(occurrences) && length(occurrences) == length(x$verbatimScientificName) ) {
  #  x$occurrenceID <- paste(datasetID, occurrences, sep = "")
  #}
  
  # add measurementID, if measurement table or multivariate table is given
  # TODO add condition: question if multiple traits are defined (multivariate measurement?)
  if(!is.null(measurements) && length(measurements) == 1) {
      colnames(x)[colnames(x) == measurements] <- "measurementID"
      measurements <- x$measurementID 
  }
    #TODO specify case for self provided measurementID vector
  
  if(length(traits) == 1 && !is.null(traits)) {
    colnames(x)[colnames(x) == traits] <- "verbatimTraitName"
    out <- x
  }
  
  # produce out while respecting id.vars to keep and drop
  if(length(traits) > 1 & longtable) {
    
    out <- suppressWarnings(
                reshape2::melt(x, 
                         measure.vars = traits[traits %in% colnames(x)], 
                         variable_name = "verbatimTraitName", 
                         id.vars = c("verbatimScientificName", 
                                     c("occurrenceID")[!is.null(occurrences)],
                                     c("measurementID")[!is.null(measurements)], 
                                     id.vars),
                         na.rm = na.rm
                        )
                        )
    
    #rename value column in "verbatimTraitValue"
    names(out)[names(out) == "variable"] <- "verbatimTraitName"
    names(out)[names(out) == "value"] <- "verbatimTraitValue"
    
    out$verbatimTraitName <- as_factor_clocale(out$verbatimTraitName)
    
  } 
  
  if(length(traits) > 1 & !longtable) { 
    
    out <- subset.data.frame(x, 
                             subset = rep_len(TRUE, nrow(x)),
                             select = c("verbatimScientificName", 
                                c("occurrenceID")[!is.null(occurrences)],
                                c("measurementID")[!is.null(measurements)], 
                                traits, 
                                id.vars)
                             )
    message("data were not converted to longtable!")   
    
  }
  
    # if only one trait is wrapped (rename value column)
  
  if(length(traits) == 1) {   
    out <- 
    colnames(out)[colnames(out) == traits] <- "verbatimTraitValue"
                                out$verbatimTraitName <- as.factor(traits)
  } 
    
  
  
  # add measurement ID 
  if(is.null(measurements) && !"measurementID" %in% colnames(out)) {
    out$measurementID <- as_factor_clocale(paste0(datasetID, 1:dim(out)[1]))
  }
  
  if(!is.null(units)) {
    out$verbatimTraitUnit <- NA
    if(length(units) == 1) out$verbatimTraitUnit <- as_factor_clocale(units)
    if(length(units) == length(traits)) out$verbatimTraitUnit <- as_factor_clocale(units[match(out$verbatimTraitName, traits)])
    if(length(units) != length(traits) & !is.null(names(units)) ) {
      out$verbatimTraitUnit <- as_factor_clocale(match(out$verbatimTraitName, names(units))) 
      levels(out$verbatimTraitUnit) <- units
    }
    if(length(units) == ( length(traits) * length(x$verbatimScientificName) )) {
      out$verbatimTraitUnit <- as_factor_clocale(units)
    }
  } 
   
  
  # perform renaming if keep contains named vector
  if(!is.null(names(keep))) {
    named <- keep[names(keep) != ""]
    colnames(out)[match(named, colnames(out))] <- names(named)
  }
  
  
  
  # sort columns according to glossary of terms
  out <- out[, order(match(names(out), glossary$columnName) )]
  
  # set metadata attributes
  
  if("metadata" %in% class(metadata)) {
      attr(out, "metadata") <- metadata
  } else {
    attr(out, "metadata") <- traitdataform::as.metadata(metadata)
  }
  
  attr(out, "metadata")$conformsTo <- "Ecological Trait-data Standard (ETS) v0.10"
    
  if(conformsTo == "v0.9") {
    out <- convert.ets0.9(out)
  }
  
  # set thesaurus attributes
  
  if("thesaurus" %in% class(thesaurus)) {
    attr(out, "thesaurus") <- thesaurus
  } 
  
  class(out) <- c( "traitdata", "data.frame")
  return(out)
  
}


#' @export
print.traitdata <- function(x, ...) {
  
  n_traits <- length(levels(x$verbatimTraitName))
  n_taxa <- length(levels(x$verbatimScientificName))
  n_measurements <- length(levels(x$measurementID))
  metadata <- attributes(x)$metadata
  thesaurus <- attributes(x)$thesaurus

  class(x) <- "data.frame"
  print(x)

  # dataset summary  
  cat("\nThis trait-dataset contains", n_traits, "traits for", n_taxa, 
      "taxa (", n_measurements, "measurements in total).\n" )
  
  print(attributes(x)$metadata)
}
EcologicalTraitData/traitdataform documentation built on June 12, 2022, 5:57 a.m.