R/old_taxa--taxmap--class.R

Defines functions taxmap

Documented in taxmap

#' Taxmap class
#'
#' A class designed to store a taxonomy and associated information. This class
#' builds on the [taxonomy()] class. User defined data can be stored in the list
#' `obj$data`, where `obj` is a taxmap object. Data that is associated with taxa
#' can be manipulated in a variety of ways using functions like [filter_taxa()]
#' and [filter_obs()]. To associate the items of lists/vectors with taxa, name
#' them by [taxon_ids()]. For tables, add a column named `taxon_id` that stores
#' [taxon_ids()].
#'
#' To initialize a `taxmap` object with associated data sets, use the parsing
#' functions [parse_tax_data()], [lookup_tax_data()], and [extract_tax_data()].
#'
#' @export
#' @param data A list of tables with data associated with the taxa.
#' @param funcs A named list of functions to include in the class. Referring to
#'   the names of these in functions like [filter_taxa()] will execute the
#'   function and return the results. If the function has at least one argument,
#'   the taxmap object is passed to it.
#' @inheritParams taxonomy
#'
#' @family classes
#' @return An `R6Class` object of class [taxmap()]
#'
#' @details on initialize, function sorts the taxon list based on rank (if rank
#'   information is available), see [ranks_ref] for the reference rank names and
#'   orders
#'
#' @examples
#' # The code below shows how to contruct a taxmap object from scratch.
#' # Typically, taxmap objects would be the output of a parsing function,
#' #  not created from scratch, but this is for demostration purposes.
#'
#' notoryctidae <- taxon(
#' name = taxon_name("Notoryctidae"),
#' rank = taxon_rank("family"),
#' id = taxon_id(4479)
#' )
#' notoryctes <- taxon(
#'   name = taxon_name("Notoryctes"),
#'   rank = taxon_rank("genus"),
#'   id = taxon_id(4544)
#' )
#' typhlops <- taxon(
#'   name = taxon_name("typhlops"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(93036)
#' )
#' mammalia <- taxon(
#'   name = taxon_name("Mammalia"),
#'   rank = taxon_rank("class"),
#'   id = taxon_id(9681)
#' )
#' felidae <- taxon(
#'   name = taxon_name("Felidae"),
#'   rank = taxon_rank("family"),
#'   id = taxon_id(9681)
#' )
#' felis <- taxon(
#'   name = taxon_name("Felis"),
#'   rank = taxon_rank("genus"),
#'   id = taxon_id(9682)
#' )
#' catus <- taxon(
#'   name = taxon_name("catus"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(9685)
#' )
#' panthera <- taxon(
#'   name = taxon_name("Panthera"),
#'   rank = taxon_rank("genus"),
#'   id = taxon_id(146712)
#' )
#' tigris <- taxon(
#'   name = taxon_name("tigris"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(9696)
#' )
#' plantae <- taxon(
#'   name = taxon_name("Plantae"),
#'   rank = taxon_rank("kingdom"),
#'   id = taxon_id(33090)
#' )
#' solanaceae <- taxon(
#'   name = taxon_name("Solanaceae"),
#'   rank = taxon_rank("family"),
#'   id = taxon_id(4070)
#' )
#' solanum <- taxon(
#'   name = taxon_name("Solanum"),
#'   rank = taxon_rank("genus"),
#'   id = taxon_id(4107)
#' )
#' lycopersicum <- taxon(
#'   name = taxon_name("lycopersicum"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(49274)
#' )
#' tuberosum <- taxon(
#'   name = taxon_name("tuberosum"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(4113)
#' )
#' homo <- taxon(
#'   name = taxon_name("homo"),
#'   rank = taxon_rank("genus"),
#'   id = taxon_id(9605)
#' )
#' sapiens <- taxon(
#'   name = taxon_name("sapiens"),
#'   rank = taxon_rank("species"),
#'   id = taxon_id(9606)
#' )
#' hominidae <- taxon(
#'   name = taxon_name("Hominidae"),
#'   rank = taxon_rank("family"),
#'   id = taxon_id(9604)
#' )
#' unidentified <- taxon(
#'   name = taxon_name("unidentified")
#' )
#'
#' tiger <- hierarchy(mammalia, felidae, panthera, tigris)
#' cat <- hierarchy(mammalia, felidae, felis, catus)
#' human <- hierarchy(mammalia, hominidae, homo, sapiens)
#' mole <- hierarchy(mammalia, notoryctidae, notoryctes, typhlops)
#' tomato <- hierarchy(plantae, solanaceae, solanum, lycopersicum)
#' potato <- hierarchy(plantae, solanaceae, solanum, tuberosum)
#' potato_partial <- hierarchy(solanaceae, solanum, tuberosum)
#' unidentified_animal <- hierarchy(mammalia, unidentified)
#' unidentified_plant <- hierarchy(plantae, unidentified)
#'
#' info <- data.frame(stringsAsFactors = FALSE,
#'                    name = c("tiger", "cat", "mole", "human", "tomato", "potato"),
#'                    n_legs = c(4, 4, 4, 2, 0, 0),
#'                    dangerous = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE))
#'
#' abund <- data.frame(code = rep(c("T", "C", "M", "H"), 2),
#'                     sample_id = rep(c("A", "B"), each = 2),
#'                     count = c(1,2,5,2,6,2,4,0),
#'                     taxon_index = rep(1:4, 2))
#'
#' phylopic_ids <- c("e148eabb-f138-43c6-b1e4-5cda2180485a",
#'                   "12899ba0-9923-4feb-a7f9-758c3c7d5e13",
#'                   "11b783d5-af1c-4f4e-8ab5-a51470652b47",
#'                   "9fae30cd-fb59-4a81-a39c-e1826a35f612",
#'                   "b6400f39-345a-4711-ab4f-92fd4e22cb1a",
#'                   "63604565-0406-460b-8cb8-1abe954b3f3a")
#'
#' foods <- list(c("mammals", "birds"),
#'               c("cat food", "mice"),
#'               c("insects"),
#'               c("Most things, but especially anything rare or expensive"),
#'               c("light", "dirt"),
#'               c("light", "dirt"))
#'
#' reaction <- function(x) {
#'   ifelse(x$data$info$dangerous,
#'          paste0("Watch out! That ", x$data$info$name, " might attack!"),
#'          paste0("No worries; its just a ", x$data$info$name, "."))
#' }
#'
#' ex_taxmap <- taxmap(tiger, cat, mole, human, tomato, potato,
#'                     data = list(info = info,
#'                                 phylopic_ids = phylopic_ids,
#'                                 foods = foods,
#'                                 abund = abund),
#'                     funcs = list(reaction = reaction))

taxmap <- function(..., .list = NULL, data = NULL, funcs = list(), named_by_rank = FALSE) {
  Taxmap$new(..., .list = .list, data = data, funcs = funcs, named_by_rank = named_by_rank)
}

Taxmap <- R6::R6Class(
  "Taxmap",
  inherit = Taxonomy,
  public = list(
    data = list(),
    funcs = list(),

    # -------------------------------------------------------------------------
    # Constructor
    initialize = function(..., .list = NULL, data = list(), funcs = list(), named_by_rank = FALSE) {

      # Call `taxonomy` constructor
      super$initialize(..., .list = .list, named_by_rank = named_by_rank)

      # Make sure `data` is in the right format and add to object
      self$data <- init_taxmap_data(self, data, self$input_ids)
      check_taxmap_data(self)

      # Make sure `funcs` is in the right format and add to object
      self$funcs <- validate_taxmap_funcs(funcs)
    },

    # -------------------------------------------------------------------------
    print = function(indent = "", max_rows = 3, max_items = 6,
                     max_width = getOption("width") - 10) {

      # Print taxonomy
      cat(paste0(indent, "<Taxmap>\n"))
      taxon_names <- vapply(self$taxa, function(x) x$name$name, character(1))
      taxon_ids <- names(self$taxa)
      if (length(self$taxa) > 0) {
        limited_print(paste(tid_font(taxon_ids), taxon_names,
                            sep = punc_font(". ")),
                      sep = punc_font(", "),
                      mid = punc_font(" ... "),
                      trunc_char = punc_font("[truncated]"),
                      prefix = paste0(indent, "  ",
                                      length(self$taxa), " taxa:"),
                      type = "cat")
        limited_print(private$make_graph(),
                      sep = punc_font(", "),
                      mid = punc_font(" ... "),
                      trunc_char = punc_font("[truncated]"),
                      prefix = paste0(indent, "  ",
                                      nrow(self$edge_list), " edges:"),
                      type = "cat")
      } else {
        cat("  No taxa\n  No edges\n")
      }

      # Get item names
      if (is.null(names(self$data))) {
        data_names <- paste0("[[", seq_len(length(self$data)), "]]")
      } else {
        data_names <- names(self$data)
        data_names[data_names == ""] <- paste0("[[", which(data_names == ""),
                                               "]]")
      }

      # Print a subset of each item, up to a max number, then just print names
      cat(paste0("  ", length(self$data), " data sets:\n"))
      if (length(self$data) > 0) {
        for (i in 1:min(c(max_items, length(self$data)))) {
          print_item(self, self$data[[i]],
                     name = data_names[i], max_rows = max_rows,
                     max_width = max_width, prefix = "    ")
        }
        if (length(self$data) > max_items) {
          cat(paste0("    And ", length(self$data) - max_items,
                     " more data sets:"))
          limited_print(data_names[(max_items + 1):length(self$data)],
                        type = "cat")
        }
      }

      # Print the names of functions
      cat(paste0("  ", length(self$funcs), " functions:\n"))
      limited_print(prefix = "   ", names(self$funcs), type = "cat")

      invisible(self)
    },

    # -------------------------------------------------------------------------
    # Check that a set of IDs are valid taxon IDs
    is_taxon_id = function(ids) {
      valid_ids <- c(unlist(self$edge_list), names(self$taxa), NA)
      ids %in% valid_ids
    },

    # -------------------------------------------------------------------------
    # Returns the names of things to be accessible using non-standard evaluation
    all_names = function(tables = TRUE, funcs = TRUE, others = TRUE,
                         builtin_funcs = TRUE, warn = FALSE) {
      output <- character(0)

      # Add functions included in the package
      if (builtin_funcs) {
        output <- c(output, private$nse_accessible_funcs)
      }

      # Get column names in each table, removing 'taxon_id'
      is_table <- vapply(self$data, is.data.frame, logical(1))
      if (tables && length(self$data[is_table]) > 0) {
        table_col_names <- unlist(lapply(self$data[is_table], colnames))
        names(table_col_names) <- paste0("data[['",
                                         rep(names(self$data[is_table]),
                                             vapply(self$data[is_table],
                                                    ncol, integer(1))),
                                         "']]")
        table_col_names <- table_col_names[table_col_names != "taxon_id"]
        output <- c(output, table_col_names)
      }

      # Get other object names in data
      is_other <- !is_table
      if (others && length(self$data[is_other]) > 0) {
        other_names <- names(self$data[is_other])
        names(other_names) <- rep("data", length(other_names))
        output <- c(output, other_names)
      }


      # Get function names
      if (funcs && length(self$funcs) > 0) {
        func_names <- names(self$funcs)
        names(func_names) <- rep("funcs", length(func_names))
        output <- c(output, func_names)
      }

      # Check for duplicates
      if (warn) {
        duplicated_names <- unique(output[duplicated(output)])
        if (length(duplicated_names) > 0) {
          warning(paste0("The following names are used more than once: ",
                         paste0(duplicated_names, collapse = ", ")))
        }
      }


      # Add the name to the name of the name and return
      names(output) <- paste0(names(output),
                              ifelse(names(output) == "", "", "[['"),
                              output,
                              ifelse(names(output) == "", "", "']]"))
      return(output)
    },


    # -------------------------------------------------------------------------
    # Get data indexes or other values associated with taxa
    obs = function(data, value = NULL, subset = NULL, recursive = TRUE,
                   simplify = FALSE) {

      # non-standard argument evaluation
      data_used <- eval(substitute(self$data_used(subset)))
      subset <- lazyeval::lazy_eval(lazyeval::lazy(subset), data = data_used)
      subset <- private$parse_nse_taxon_subset(subset)
      obs_taxon_ids <- self$get_data_taxon_ids(data, require = TRUE)

      # Get observations of taxa
      if (is.logical(recursive) && recursive == FALSE) {
        recursive = 0
      }
      if (recursive || is.numeric(recursive)) {
        my_subtaxa <- self$subtaxa(subset = unname(subset),
                                   recursive = recursive,
                                   include_input = TRUE,
                                   value = "taxon_indexes")
        #unname is needed for some reason.. something to look into...
      } else {
        my_subtaxa <- subset
      }
      obs_taxon_index <- match(obs_taxon_ids, self$taxon_ids())
      obs_key <- split(seq_along(obs_taxon_ids), obs_taxon_index)
      output <- stats::setNames(
        lapply(my_subtaxa,function(x) unname(unlist(obs_key[as.character(x)]))),
        names(subset)
      )
      is_null <- vapply(output, is.null, logical(1))
      output[is_null] <- lapply(1:sum(is_null), function(x) numeric(0))

      # Look up values
      if (!is.null(value)) {
        possible_values <- self$get_data(value)[[1]]
        if (length(possible_values) != length(obs_taxon_ids)) {
          stop(call. = FALSE,
               'The value "', value, '" is not the same length as the data set "', data, '".')
        }
        if (! is.null(names(possible_values)) && any(names(possible_values) != names(obs_taxon_ids))) {
          stop(call. = FALSE,
               'The value "', value, '" is in a different order than the data set "', data,
               '" according to taxon IDs.')
        }
        output <- lapply(output, function(i) possible_values[i])
      }

      # Reduce dimensionality
      if (simplify) {
        output <- simplify(output)
      }

      return(output)
    },


    # -------------------------------------------------------------------------
    # Apply a function to data for the observations for each taxon.
    # This is similar to using obs() with lapply() or sapply().
    obs_apply = function(data, func, simplify = FALSE, value = NULL,
                         subset = NULL, recursive = TRUE, ...) {
      my_obs <- self$obs(data, simplify = FALSE, value = value,
                         subset = eval(substitute(subset)),
                         recursive = recursive)
      output <- lapply(my_obs, func, ...)
      if (simplify) {
        output <- simplify(output)
      }
      return(output)
    },

    # -------------------------------------------------------------------------
    # Filter data in a taxmap() object (in obj$data) with a set of conditions.
    filter_obs = function(data, ..., drop_taxa = FALSE, drop_obs = TRUE,
                          subtaxa = FALSE, supertaxa = TRUE,
                          reassign_obs = FALSE, target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # Check that multiple datasets are the same length
      dataset_length <- vapply(self$data[data], FUN.VALUE = numeric(1),
                               function(one) {
                                 if (is.data.frame(one)) {
                                   return(nrow(one))
                                 } else {
                                   return(length(one))
                                 }
                               })
      dataset_length <- unique(dataset_length)
      if (length(dataset_length) > 1) {
        stop(call. = FALSE,
             "If multiple datasets are filtered at once, then they must the same length. ",
             "The following lengths were found for the specified datasets:\n",
             limited_print(type = "silent", prefix = "  ", dataset_length))
      }

      # non-standard argument evaluation
      selection <- lazyeval::lazy_eval(lazyeval::lazy_dots(...),
                                       data = self$data_used(...))

      # Parse drop_obs option
      drop_obs <- parse_possibly_named_logical(
        drop_obs,
        self$data,
        formals(self$filter_obs)$drop_obs
      )

      # If no selection is supplied, match all rows
      if (length(selection) == 0) {
        selection <- list(seq_len(dataset_length))
      }

      # convert taxon_ids to indexes
      is_char <- vapply(selection, is.character, logical(1))
      if (sum(is_char) > 0) {
        stop(paste("observation filtering with taxon IDs is not currently",
                   "supported. If you want to filter observation by taxon IDs,",
                   "use something like: `obj$data$my_target$taxon_ids %in%",
                   "my_subset`"))
      }

      # convert logical to indexes
      is_logical <- vapply(selection, is.logical, logical(1))
      for (one in selection[is_logical]) {
        if (length(one) != dataset_length) {
          stop(call. = FALSE,
               "All logical filtering criteria must be the same length as the data sets filtered.")
        }
      }
      selection[is_logical] <- lapply(selection[is_logical], which)

      # combine filters
      intersect_with_dups <- function(a, b) {
        rep(sort(intersect(a, b)), pmin(table(a[a %in% b]), table(b[b %in% a])))
      }
      selection <- Reduce(intersect_with_dups, selection)

      # Remove observations
      data_taxon_ids <- NULL
      for (one in data) {
        data_taxon_ids <- c(data_taxon_ids, self$get_data_taxon_ids(one, require = drop_taxa)[selection])
        private$remove_obs(data = one, indexes = selection)
      }

      # Remove unobserved taxa
      data_taxon_ids <- unique(data_taxon_ids)
      if (drop_taxa & ! is.null(data_taxon_ids)) {

        # dont remove taxa that appear in other data sets if they are not also filtered
        sets_to_keep_ids_from <- names(drop_obs[! drop_obs])
        other_ids_to_keep <- unique(unlist(lapply(sets_to_keep_ids_from,
                                                  self$get_data_taxon_ids)))
        taxon_ids_to_keep <- unique(c(data_taxon_ids, other_ids_to_keep))

        # Remove taxa that are not in the filtered data set
        self$filter_taxa(taxon_ids_to_keep, drop_obs = drop_obs,
                         subtaxa = subtaxa, supertaxa = supertaxa,
                         reassign_obs = reassign_obs)
      }

      return(self)
    },


    # -------------------------------------------------------------------------
    # Subsets columns in a data set
    select_obs = function(data, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # Check that the datasets are tables
      for (one in data) {
        if (! is.data.frame(self$data[[one]])) {
          stop(paste0('"data" ', one, ' is not a table, so columns cannot be selected.'))
        }
      }

      # Subset columns
      for (one in data) {
        self$data[[one]] <-
          dplyr::bind_cols(self$data[[one]][ , c("taxon_id"), drop = FALSE],
                           dplyr::select(self$data[[one]], ...))

      }

      return(self)
    },


    # -------------------------------------------------------------------------
    # Add columns to tables in obj$data
    mutate_obs = function(data, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      dataset_index <- parse_dataset(self, data, must_be_valid = FALSE, needed = FALSE)

      # Check that only one data is specified
      if (length(data) > 1) {
        stop(call. = FALSE,
             'Only one data can be mutated at a time.')
      }

      # Get data used in expressions to add
      data_used <- self$data_used(...)
      unevaluated <- lazyeval::lazy_dots(...)

      # add columns
      if (length(dataset_index) > 0) {
        # Check that the data is a table
        if (! is.data.frame(self$data[[dataset_index]])) {
          stop(paste0('Dataset "', data, '" is not a table, so columns cannot be added'))
        } else {
          for (index in seq_along(unevaluated)) {
            new_col <- lazyeval::lazy_eval(unevaluated[index], data = data_used)
            data_used <- c(data_used, new_col) # Allows this col to be used in next cols
            self$data[[dataset_index]][[names(new_col)]] <- new_col[[1]]
          }
        }
      } else { # not a current data
        new_dataset <- list()
        for (index in seq_along(unevaluated)) {
          new_col <- lazyeval::lazy_eval(unevaluated[index], data = data_used)
          data_used <- c(data_used, new_col) # Allows this col to be used in next cols
          new_dataset <- c(new_dataset, new_col)
        }
        if (any(names(unevaluated) == "")) { # unnammed inputs cant be put in tables
          if (length(unevaluated) == 1) { # Add as a vector
            message('Adding a new "', class(new_dataset[[1]]),'" vector of length ', length(new_dataset[[1]]), '.')
            self$data[[data]] <- new_dataset[[1]]
          } else {
            stop(call. = FALSE,
                 "Cannot add a new dataset with multiple values if any are unnamed.",
                 " The following input indexes are unnamed:\n",
                 limited_print(which(names(unevaluated) == ""), type = "silent", prefix = "  "))
          }
        } else { # Try to put in new table
          part_lengths <- vapply(new_dataset, length, numeric(1))
          if (length(unique(part_lengths[part_lengths != 1])) == 1) { # All inputs are same length or 1
            new_dataset <- dplyr::as_tibble(new_dataset)
            message('Adding a new ', nrow(new_dataset), ' x ', ncol(new_dataset),
                    ' table called "', data, '"')
            self$data[[data]] <- dplyr::as_tibble(new_dataset)
          } else {
            stop(call. = FALSE,
                 "Cannot make a new table out of multiple values of unequal length.",
                 " The inputs have the following lengths:\n",
                 limited_print(part_lengths, type = "silent", prefix = "  "))

          }
        }
      }

      return(self)
    },


    # -------------------------------------------------------------------------
    # Replace columns of tables in obj$data
    transmute_obs = function(data, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # Check that only one data is specified
      if (length(data) > 1) {
        stop(call. = FALSE,
             'Only one dataset can be transmuted at a time.')
      }

      # Check that the dataset is a table
      if (! is.data.frame(self$data[[data]])) {
        stop(paste0('Dataset "', data, '" is not a table, so columns cannot be selected.'))
      }

      if ("taxon_id" %in% colnames(self$data[[data]])) {
        result <- list(taxon_id = self$data[[data]]$taxon_id)
      } else {
        result <- list()
      }
      data_used <- self$data_used(...)
      unevaluated <- lazyeval::lazy_dots(...)
      for (index in seq_along(unevaluated)) {
        new_col <- lazyeval::lazy_eval(unevaluated[index], data = data_used)
        # Allow this col to be used in evaluating the next cols
        data_used <- c(data_used, new_col)
        result[[names(new_col)]] <- new_col[[1]]
      }
      self$data[[data]] <- tibble::as_tibble(result)
      return(self)
    },


    # -------------------------------------------------------------------------
    # Sort columns of tables in obj$data
    arrange_obs = function(data, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        stop('Use of "target" is depreciated. Use "data" instead.')
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # Check that multiple datasets are the same length
      dataset_length <- vapply(self$data[data], FUN.VALUE = numeric(1),
                               function(one) {
                                 if (is.data.frame(one)) {
                                   return(nrow(one))
                                 } else {
                                   return(length(one))
                                 }
                               })
      dataset_length <- unique(dataset_length)
      if (length(dataset_length) > 1) {
        stop(call. = FALSE,
             "If multiple datasets are filtered at once, then they must the same length. ",
             "The following lengths were found for the specified datasets:\n",
             limited_print(type = "silent", prefix = "  ", dataset_length))
      }

      # Sort observations
      data_used <- self$data_used(...)
      for (one in data) {
        if (is.data.frame(self$data[[one]])) { # if it is a table
          sort_cols <- data_used[! names(data_used) %in% names(self$data[[one]])]
          if (length(sort_cols) == 0) {
            self$data[[one]] <- dplyr::arrange(self$data[[one]], ...)
          } else {
            target_with_extra_cols <-
              dplyr::bind_cols(sort_cols, self$data[[one]])
            self$data[[one]] <-
              dplyr::arrange(target_with_extra_cols, ...)[, -seq_along(sort_cols)]
          }
        } else { # if it is a list or vector
          dummy_table <- data.frame(index = seq_along(self$data[[one]]))
          if (length(data_used)!= 0) {
            dummy_table <- dplyr::bind_cols(data_used, dummy_table)
          }
          dummy_table <- dplyr::arrange(dummy_table, ...)
          self$data[[one]] <- self$data[[one]][dummy_table$index]
        }
      }

      return(self)
    },


    # -------------------------------------------------------------------------
    # Randomly sample some number of observations from a table
    sample_n_obs = function(data, size, replace = FALSE, taxon_weight = NULL,
                            obs_weight = NULL, use_supertaxa = TRUE,
                            collapse_func = mean, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # non-standard argument evaluation
      data_used <- eval(substitute(self$data_used(taxon_weight, obs_weight)))
      taxon_weight <- lazyeval::lazy_eval(lazyeval::lazy(taxon_weight),
                                          data = data_used)
      obs_weight <- lazyeval::lazy_eval(lazyeval::lazy(obs_weight),
                                        data = data_used)

      # Check that multiple datasets are the same length
      dataset_length <- vapply(self$data[data], FUN.VALUE = numeric(1),
                               function(one) {
                                 if (is.data.frame(one)) {
                                   return(nrow(one))
                                 } else {
                                   return(length(one))
                                 }
                               })
      dataset_length <- unique(dataset_length)
      if (length(dataset_length) > 1) {
        stop(call. = FALSE,
             "If multiple datasets are sampled at once, then they must the same length. ",
             "The following lengths were found for the specified datasets:\n",
             limited_print(type = "silent", prefix = "  ", dataset_length))
      }

      # Calculate taxon component of taxon weights
      if (is.null(taxon_weight)) {
        obs_taxon_weight <- rep(1, dataset_length)
      } else {
        obs_index <- match(self$get_data_taxon_ids(data, require = TRUE),
                           self$taxon_ids())
        my_supertaxa <- self$supertaxa(recursive = use_supertaxa,
                                       simplify = FALSE, include_input = TRUE,
                                       na = FALSE,
                                       value = "taxon_indexes")
        taxon_weight_product <- vapply(
          my_supertaxa,
          function(x) collapse_func(taxon_weight[x]),
          numeric(1)
        )
        obs_taxon_weight <- taxon_weight_product[obs_index]
      }
      obs_taxon_weight <- obs_taxon_weight / sum(obs_taxon_weight)

      # Calculate observation component of observation weights
      if (is.null(obs_weight)) {
        obs_weight <- rep(1, dataset_length)
      }
      obs_weight <- obs_weight / sum(obs_weight)

      # Combine observation and taxon weight components
      combine_func <- prod
      weight <- mapply(obs_taxon_weight, obs_weight,
                       FUN = function(x, y) combine_func(c(x,y)))
      weight <- weight / sum(weight)

      # Sample observations
      sampled_rows <- sample.int(dataset_length, size = size,
                                 replace = replace, prob = weight)
      self$filter_obs(data, sampled_rows, ...)
    },

    # -------------------------------------------------------------------------
    # Randomly sample some proportion of observations from a table
    sample_frac_obs = function(data, size, replace = FALSE,
                               taxon_weight = NULL, obs_weight = NULL,
                               use_supertaxa = TRUE,
                               collapse_func = mean, ..., target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # Parse data option
      data <- parse_dataset(self, data)

      # Check that multiple datasets are the same length
      dataset_length <- vapply(self$data[data], FUN.VALUE = numeric(1),
                               function(one) {
                                 if (is.data.frame(one)) {
                                   return(nrow(one))
                                 } else {
                                   return(length(one))
                                 }
                               })
      dataset_length <- unique(dataset_length)
      if (length(dataset_length) > 1) {
        stop(call. = FALSE,
             "If multiple datasets are sampled at once, then they must the same length. ",
             "The following lengths were found for the specified datasets:\n",
             limited_print(type = "silent", prefix = "  ", dataset_length))
      }

      # Call sample_n_obs
      eval(substitute(self$sample_n_obs(data = data,
                                        size = size * dataset_length,
                                        replace = replace,
                                        taxon_weight = taxon_weight,
                                        obs_weight = obs_weight,
                                        use_supertaxa = use_supertaxa,
                                        collapse_func = collapse_func,
                                        ...)))
    },


    # -------------------------------------------------------------------------
    # Count observations for each taxon in a data set
    n_obs = function(data = NULL, target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      # If no data is specified, use the first dataset
      if (is.null(data)) {
        if (length(self$data) > 0) {
          data <- names(self$data)[1]
        } else {
          stop(paste0('There are no data sets to get observation info from.'))
        }
      }

      # Parse data option
      data <- self$get_data(data)

      # Count observations
      vapply(self$obs(data, recursive = TRUE, simplify = FALSE),
             length, numeric(1))
    },

    # -------------------------------------------------------------------------
    # Count observations for each taxon in a data set, including observations
    # for the specific taxon but NOT the observations of its subtaxa.
    n_obs_1 = function(data = NULL, target = NULL) {

      # Check for use of "target"
      if (! is.null(target)) {
        warning(call. = FALSE,
                'Use of "target" is depreciated. Use "data" instead.')
        data <- target
      }

      if (is.null(data)) {
        if (length(self$data) > 0) {
          data <- names(self$data)[1]
        } else {
          stop(paste0('There are no data sets to get observation info from.'))
        }
      }
      vapply(self$obs(data, recursive = FALSE, simplify = FALSE),
             length, numeric(1))
    },

    # Find taxon ids for datasets by dataset name
    #
    # require: if TRUE, require that taxon ids be present, or make an error
    get_data_taxon_ids = function(dataset_name, require = FALSE, warn = FALSE, message = FALSE) {

      stop_or_warn <- function(text) {
        if (require) {
          stop(call. = FALSE, text)
        }
        if (warn) {
          warning(call. = FALSE, text)
        } else if (message) {
          message(text)
        }
      }

      # Get the dataset
      if (length(dataset_name) == 1 && # data is name/index of dataset in object
          (dataset_name %in% names(self$data) || is.numeric(dataset_name))) {
        data <- self$data[[dataset_name]]
      } else { # it is an external data set, not in the object
        data <- dataset_name
        dataset_name <- deparse(substitute(dataset_name))
      }

      # Extract taxon ids if they exist
      if (is.data.frame(data)) {
        if ("taxon_id" %in% colnames(data)) {
          is_valid <- private$ids_are_valid(data$taxon_id)
          if (all(is_valid)) {
            return(data$taxon_id)
          } else  {
            stop_or_warn(paste0('There is a "taxon_id" column in the data set "',
                                dataset_name, '", but the following invalid IDs were found:\n  ',
                                limited_print(data$taxon_id[! is_valid], type = "silent")))
            return(NULL)
          }
        } else {
          stop_or_warn(paste0('There is no "taxon_id" column in the data set "',
                              dataset_name, '", so there are no taxon IDs.'))
          return(NULL)
        }
      } else if (inherits(data, "list") || is.vector(data) || can_be_used_in_taxmap(data)) {
        if (! is.null(names(data))) {
          is_valid <- private$ids_are_valid(names(data))
          if (all(is_valid)) {
            return(names(data))
          } else if (all(! is_valid)) {
            stop_or_warn(paste0('The data set "', dataset_name,
                                '" is named, but not named by taxon ids.'))
            return(NULL)
          } else { # some are valid, but not all
            stop_or_warn(paste0('Dataset "', dataset_name, '" appears to be named by taxon IDs, but contains ', sum(! is_valid), ' invalid IDs:\n  ',
                                limited_print(names(data)[! is_valid], type = "silent")))
            return(NULL)
          }
        } else {
          stop_or_warn(paste0('The data set "', dataset_name,
                              '" is unnamed, ',
                              'so there are no taxon ids.'))
          return(NULL)
        }
      } else {
        stop_or_warn(paste0('I dont know how to extract taxon ids from dataset "', dataset_name,
                            '" of type "', class(data)[1], '".'))
        return(NULL)
      }
    },

    # Get a data set from a taxmap selfect
    get_dataset = function(data) {

      # Convert logicals to numerics
      if (is.logical(data)) {
        if (length(data) != length(self$data)) {
          stop("When using a TRUE/FALSE vector to specify the data set, it must be the same length as the number of data sets",
               call. = FALSE)
        } else {
          data <- which(data)
        }
      }

      # Check for multiple/no values
      if (length(data) == 0) {
        stop('No data specified.', call. = FALSE)
      }
      if (length(data) > 1) {
        stop('Only one dataset can be used.', call. = FALSE)
      }

      # Check that dataset exists
      error_msg <- paste0('The dataset "', data,
                          '" cannot be found. Datasets found include:\n  ',
                          limited_print(paste0("[", seq_along(self$data), "] ", names(self$data)),
                                        type = "silent"))
      if (is.character(data)) {
        if (! data %in% names(self$data)) {
          stop(error_msg, call. = FALSE)
        }
      } else if (is.numeric(data)) {
        if (! data %in% seq_along(self$data)) {
          stop(error_msg, call. = FALSE)
        }
      }

      # Return without printing
      return(self$data[[data]])
    }

  ),

  private = list(
    nse_accessible_funcs = c(
      "taxon_names",
      "taxon_ids",
      "taxon_indexes",
      "classifications",
      "n_supertaxa",
      "n_supertaxa_1",
      "n_subtaxa",
      "n_subtaxa_1",
      "n_leaves",
      "n_leaves_1",
      "taxon_ranks",
      "is_root",
      "is_stem",
      "is_branch",
      "is_leaf",
      "is_internode",
      "n_obs",
      "n_obs_1"
    ),

    # Remove observations from a particular dataset or just remove the taxon ids
    # NOTE: indexes = what is NOT removed
    remove_obs = function(data, indexes, unname_only = FALSE) {
      if (unname_only) {
        if (is.data.frame(self$data[[data]])) {
          self$data[[data]][! indexes, "taxon_id"] <- as.character(NA)
        } else {
          names(self$data[[data]])[! indexes] <- as.character(NA)
        }
      } else {
        if (is.data.frame(self$data[[data]])) {
          self$data[[data]] <-
            self$data[[data]][indexes, , drop = FALSE]
        } else {
          self$data[[data]] <- self$data[[data]][indexes]
        }
      }
    },

    # Checks if a character vector contains only taxon IDs.
    # Returns logical vector same length as input
    ids_are_valid = function(ids_to_check) {
      ids_to_check %in% c(self$taxon_ids(), NA_character_)
    }
  )
)

Try the metacoder package in your browser

Any scripts or data that you put into this service are public.

metacoder documentation built on April 4, 2023, 9:08 a.m.