R/mudata_subset.R

Defines functions .factorize .tidyselect_vars .vars_rename filter_params.default filter_params filter_locations.default filter_locations filter_data_base filter_data.default filter_data filter_datasets.default filter_datasets select_params.default select_params select_locations.default select_locations select_datasets.default select_datasets subset.mudata rbind.mudata

Documented in filter_data filter_data.default filter_datasets filter_datasets.default filter_locations filter_locations.default filter_params filter_params.default rbind.mudata select_datasets select_datasets.default select_locations select_locations.default select_params select_params.default subset.mudata

#' Combine mudata objects
#'
#' This implmentation of [rbind] combines component tables using [bind_rows][dplyr::bind_rows]
#' and [distinct][dplyr::distinct]. When combined object use different datasets, or when subsets of
#' the same object are recombined, this function works well. When this is not the case, it
#' may be necessary to modify the tables such that when they are passed to [bind_rows][dplyr::bind_rows]
#' and [distinct][dplyr::distinct], no duplicate information exists. This should be picked up by
#' [validate_mudata].
#'
#' @param ... [mudata] objects to combine
#' @param validate Flag to validate the final object using [validate_mudata].
#'
#' @return A mudata object
#' @export
#'
#' @examples
#' rbind(
#'   kentvillegreenwood %>%
#'     select_params(maxtemp) %>%
#'     select_locations(starts_with("KENT")),
#'   kentvillegreenwood %>%
#'     select_params(mintemp) %>%
#'     select_locations(starts_with("GREEN"))
#' )
#'
rbind.mudata <- function(..., validate = TRUE) {
  mudatas <- list(...)
  x_columns <- unname(unique(unlist(lapply(mudatas, attr, "x_columns"))))
  mudata(
    data = do.call(dplyr::bind_rows, lapply(mudatas, function(m) m$data)),
    locations = dplyr::distinct(do.call(dplyr::bind_rows, lapply(mudatas, function(m) m$locations))),
    params = dplyr::distinct(do.call(dplyr::bind_rows, lapply(mudatas, function(m) m$params))),
    datasets = dplyr::distinct(do.call(dplyr::bind_rows, lapply(mudatas, function(m) m$datasets))),
    columns = dplyr::distinct(do.call(dplyr::bind_rows, lapply(mudatas, function(m) m$columns))),
    x_columns = x_columns, validate = validate
  )
}

#' Subset a MuData object
#'
#' This object uses standard evalutation to subset a [mudata] object using
#' character vectors of datasets, params, and locations. The result is subsetted such
#' that all rows in the data table are documented in the other tables (provided)
#' they were to begin with. It is preferred to use [select_locations],
#' [select_params], and [select_datasets] to subset a mudata object,
#' or [filter_data], [filter_locations], [filter_params],
#' and [filter_datasets] to subset by row while maintaining internal
#' consistency.
#'
#' @param x The object to subset
#' @param datasets Vector of datasets to include
#' @param params  Vector of parameters to include
#' @param locations Vector of locations to include
#' @param ... Used to [filter][dplyr::filter] the data table
#'
#' @seealso
#' [select_locations], [select_params], [select_datasets], [filter_data],
#' [filter_locations], [filter_params], and [filter_datasets]
#'
#' @return A subsetted mudata object
#' @export
#'
#' @examples
#' subset(kentvillegreenwood, params = c("mintemp", "maxtemp"))
#'
subset.mudata <- function(x, ..., datasets = NULL, params = NULL,
                          locations = NULL) {
  # enquos ...
  filter_args <- quos(...)

  # filter data
  new_x <- filter_data(x, !!!filter_args)
  if (!is.null(datasets)) {
    new_x <- filter_data(new_x, .data$dataset %in% datasets)
  }
  if (!is.null(locations)) {
    new_x <- filter_data(new_x, .data$location %in% locations)
  }
  if (!is.null(params)) {
    new_x <- filter_data(new_x, .data$param %in% params)
  }

  # return filtered object
  new_x
}

#' Subset a mudata object by identifier
#'
#' These functions use dplyr-like selection syntax to quickly subset a mudata
#' object by param, location, or dataset. Params, locations, an datasets can also
#' be renamed using keyword arguments, identical to dplyr selection syntax.
#'
#' @param .data A mudata object
#' @param ... Quoted names, bare names, or helpers like [starts_with][dplyr::starts_with],
#'   [contains][dplyr::contains], [ends_with][dplyr::ends_with], [one_of][dplyr::one_of],
#'   or [matches][dplyr::matches].
#' @param .factor If TRUE, the new object will keep the order specified by converting
#'   columns to factors. This may be useful for specifying order when using
#'   ggplot2.
#'
#' @seealso
#' [select][dplyr::select], [rename_locations], [distinct_locations],
#' [filter_locations]
#'
#' @rdname selecters
#' @return A subsetted mudata object.
#' @export
#'
#' @examples
#' # renaming can be handy when locations are verbosely named
#' ns_climate %>%
#'   select_locations(
#'     sable_island = starts_with("SABLE"),
#'     nappan = starts_with("NAPPAN"),
#'     baddeck = starts_with("BADDECK")
#'   ) %>%
#'   select_params(ends_with("temp"))
#'
#' # can also use quoted values
#' long_lake %>%
#'   select_params("Pb", "As", "Cr")
#'
#' # can also use negative values to remove params/datasets/locations
#' long_lake %>%
#'   select_params(-Pb)
#'
#' # to get around non-standard evaluation, use one_of()
#' my_params <- c("Pb", "As", "Cr")
#' long_lake %>%
#'   select_params(one_of(my_params))
#'
select_datasets <- function(.data, ...) {
  UseMethod("select_datasets")
}

#' @rdname selecters
#' @export
select_datasets.default <- function(.data, ..., .factor = FALSE) {
  # quo-ify datasets
  datasets <- quos(...)
  # use tidyselect to get dataset names
  datasets <- tidyselect::vars_select(.tidyselect_vars(.data, "dataset"), !!!datasets)
  new_datasets <- names(datasets)
  # use subset() to do the subsetting
  md_out <- filter_data(.data, .data$dataset %in% datasets)
  # rename datasets using rename_dataset
  if (any(new_datasets != datasets)) {
    renamer <- datasets[new_datasets != datasets]
    md_out <- rename_datasets_base(md_out, stats::setNames(names(renamer), renamer))
  }

  if (.factor) {
    md_out <- .factorize(md_out, "dataset", new_datasets)
  }

  md_out
}

#' @rdname selecters
#' @export
select_locations <- function(.data, ...) {
  UseMethod("select_locations")
}

#' @rdname selecters
#' @export
select_locations.default <- function(.data, ..., .factor = FALSE) {
  # quo-ify locations
  locations <- quos(...)
  # use tidyselect to get location names
  locations <- tidyselect::vars_select(.tidyselect_vars(.data, "location"), !!!locations)
  new_locations <- names(locations)
  # use subset() to do the subsetting
  md_out <- filter_data(.data, .data$location %in% locations)
  # rename datasets using rename_locations_base
  if (any(new_locations != locations)) {
    renamer <- locations[new_locations != locations]
    md_out <- rename_locations_base(md_out, stats::setNames(names(renamer), renamer))
  }

  if (.factor) {
    md_out <- .factorize(md_out, "location", new_locations)
  }

  md_out
}

#' @rdname selecters
#' @export
select_params <- function(.data, ...) {
  UseMethod("select_params")
}

#' @rdname selecters
#' @export
select_params.default <- function(.data, ..., .factor = FALSE) {
  # quo-ify params
  params <- quos(...)
  # use tidyselect to get location names
  params <- tidyselect::vars_select(.tidyselect_vars(.data, "param"), !!!params)
  new_params <- names(params)
  # use subset() to do the subsetting
  md_out <- filter_data(.data, .data$param %in% params)
  # rename datasets using rename_params_base
  if (any(new_params != params)) {
    renamer <- params[new_params != params]
    md_out <- rename_params_base(md_out, stats::setNames(names(renamer), renamer))
  }

  if (.factor) {
    md_out <- .factorize(md_out, "param", new_params)
  }

  md_out
}


#' Subset a mudata object by complex expression
#'
#' These methods allow more complex selection criteria than [select_datasets] and
#' family, which only use the identifier values. These methods first subset
#' the required table using the provided expression, then subset other tables
#' to ensure internal consistency.
#'
#' @param .data A [mudata] object
#' @param ... Objects passed to [filter][dplyr::filter] on the appropriate table
#'
#' @seealso
#' [filter][dplyr::filter], [select_locations]
#'
#' @rdname filterers
#' @return A subsetted mudata object
#' @export
#'
#' @examples
#' # select only locations with a latitude above 45
#' ns_climate %>%
#'   filter_locations(latitude > 45)
#'
#' # select only params measured in mm
#' ns_climate %>%
#'   filter_params(unit == "mm")
#'
#' # select only june temperature from ns_climate
#' library(lubridate)
#' ns_climate %>%
#'   filter_data(month(date) == 6)
#'
filter_datasets <- function(.data, ...) {
  UseMethod("filter_datasets")
}

#' @rdname filterers
#' @export
filter_datasets.default <- function(.data, ...) {
  filter_args <- quos(...)
  if (length(filter_args) == 0) {
    return(.data)
  }

  datasets <- .data$datasets %>%
    dplyr::filter(!!!filter_args) %>%
    dplyr::distinct(.data$dataset) %>%
    dplyr::pull("dataset")
  filter_data(.data, .data$dataset %in% datasets)
}

#' @rdname filterers
#' @export
filter_data <- function(.data, ...) {
  UseMethod("filter_data")
}

#' @rdname filterers
#' @export
filter_data.default <- function(.data, ...) {
  filter_args <- quos(...)
  if (length(filter_args) == 0) {
    return(.data)
  }

  # filter data
  dta <- dplyr::filter(.data$data, !!!filter_args)
  filter_data_base(.data, dta)
}

# syncs data with other tables
filter_data_base <- function(.data, dta) {
  pm <- dplyr::semi_join(.data$params, dta, by = c("dataset", "param"))
  lc <- dplyr::semi_join(.data$locations, dta, by = c("dataset", "location"))
  cl <- dplyr::semi_join(.data$columns, dta, by = "dataset")
  ds <- dplyr::semi_join(.data$datasets, dta, by = "dataset")

  # keep class of original
  new_mudata(
    list(
      data = dta,
      locations = lc,
      params = pm,
      datasets = ds,
      columns = cl
    ),
    x_columns = x_columns(.data)
  )
}

#' @rdname filterers
#' @export
filter_locations <- function(.data, ...) {
  UseMethod("filter_locations")
}

#' @rdname filterers
#' @export
filter_locations.default <- function(.data, ...) {
  new_locations <- dplyr::filter(.data$locations, ...)
  new_data <- dplyr::semi_join(.data$data, new_locations, by = c("dataset", "location"))
  filter_data_base(.data, new_data)
}

#' @rdname filterers
#' @export
filter_params <- function(.data, ...) {
  UseMethod("filter_params")
}

#' @rdname filterers
#' @export
filter_params.default <- function(.data, ...) {
  new_params <- dplyr::filter(.data$params, ...)
  new_data <- dplyr::semi_join(.data$data, new_params, by = c("dataset", "param"))
  filter_data_base(.data, new_data)
}

.vars_rename <- function(names, type, ...) {
  tryCatch(
    tidyselect::vars_rename(names, ...),
    error = function(e) {
      if (inherits(e, "vctrs_error_subscript_oob")) {
        rlang::abort(glue::glue("No such {type}: `{e$i}`"), parent = e, class = "no_such_item")
      } else if (inherits(e, "vctrs_error_names_must_be_unique")) {
        rlang::abort(glue::glue("{type}s must be unique"), class = "item_not_unique")
      } else {
        rlang::abort(glue::glue("Error during {type} renaming: {e}"))
      }
    }
  )
}

.tidyselect_vars <- function(x, type) {
  singular <- type
  plural <- paste0(type, "s")
  vars <- as.character(.distinct_vector(x[[plural]], singular))
  attr(vars, "type") <- c(singular, plural)
  vars
}

.factorize <- function(md, column, levels) {
  for (tbl in src_tbls(md)) {
    if (column %in% colnames(md[[tbl]])) {
      md[[tbl]][[column]] <- factor(md[[tbl]][[column]], levels = levels)
    }
  }
  md
}

Try the mudata2 package in your browser

Any scripts or data that you put into this service are public.

mudata2 documentation built on Jan. 22, 2023, 1:48 a.m.