R/distinct.R

Defines functions duckplyr_distinct distinct.duckplyr_df

# Generated by 02-duckplyr_df-methods.R
# Register the helper column name as a known global: it is used as a bare
# symbol inside `quo()` in `distinct.duckplyr_df()` below, which would
# otherwise be reported as an undefined global variable by R CMD check.
utils::globalVariables("___row_number_by")

# distinct() method for duckplyr data frames.
#
# First attempts to compute the distinct rows on a DuckDB relation inside
# `rel_try()`. On success the code block returns directly from this function.
# If control comes back from `rel_try()` (presumably because it intercepted an
# error or declined to run the block -- confirm against `rel_try()`), the call
# is forwarded to dplyr's data frame method.
#
# Arguments:
#   .data      Data frame to deduplicate.
#   ...        Optional expressions that determine uniqueness; captured as
#              named quosures. With no expressions, all columns are used.
#   .keep_all  If `TRUE`, project all columns of `.data` instead of only the
#              expressions given in `...`.
#
# Value: on the relational path, the result of `rel_to_df()` passed through
#   `dplyr_reconstruct()` with `.data` as template; otherwise whatever
#   dplyr's `distinct.data.frame()` returns.
#' @export
distinct.duckplyr_df <- function(.data, ..., .keep_all = FALSE) {
  dots <- enquos(..., .named = TRUE)

  # Our implementation
  rel_try(
    call = list(
      name = "distinct",
      x = .data,
      args = list(dots = dots, .keep_all = .keep_all)
    ),
    "Implemented for all cases?" = FALSE,
    {
      # FIXME: avoid column duplication in a cleaner way
      # When the same name occurs more than once, only the last one is kept.
      dupes <- duplicated(names(dots), fromLast = TRUE)
      dots <- dots[!dupes]

      rel <- duckdb_rel_from_df(.data)

      # The row-number based path is taken whenever all columns must be kept,
      # or when `oo_force()` requests it.
      oo <- .keep_all || oo_force()

      if (oo) {
        # Push row number as separate projection
        # (the code below relies on a `___row_number` column being available
        # afterwards -- presumably added by `oo_prep()`; confirm there)
        rel <- oo_prep(rel, force = TRUE)

        exprs <- rel_translate_dots(dots, .data)
        all_exprs <- NULL
        if (length(exprs) == 0) {
          # No expressions given: every column of `.data` defines uniqueness.
          exprs <- imap(set_names(names(.data)), relexpr_reference, rel = NULL)
          all_exprs <- exprs
        }

        if (.keep_all) {
          # Reuse the column references built above if available.
          proj_exprs <- all_exprs %||% imap(set_names(names(.data)), relexpr_reference, rel = NULL)
        } else {
          proj_exprs <- exprs
        }

        # Besides the output columns, carry the row number and a per-partition
        # row number (partitioned by the uniqueness expressions, ordered by
        # the row number) under the alias `___row_number_by`.
        proj_exprs <- c(proj_exprs, list(
          relexpr_reference("___row_number"),
          relexpr_window(
            relexpr_function("row_number", list()),
            partitions = exprs,
            order_bys = list(relexpr_reference("___row_number")),
            alias = "___row_number_by"
          )
        ))

        rel <- rel_project(rel, unname(proj_exprs))

        # Keep only the first row of each partition.
        expr_filter <- rel_translate(
          quo(`___row_number_by` == 1L),
          names_data = "___row_number_by"
        )
        out_rel <- rel_filter(rel, list(expr_filter))

        # NOTE(review): presumably these restore the original row order and
        # drop the helper columns -- confirm against the `oo_*()` helpers.
        out_rel <- oo_restore_order(out_rel, force = TRUE)
        out_rel <- oo_restore_cols(out_rel, extra = "___row_number_by", force = TRUE)
      } else {
        # Order-agnostic path: project the requested expressions (if any)
        # and deduplicate the relation directly.
        exprs <- rel_translate_dots(dots, .data)
        if (length(exprs) > 0) {
          rel <- rel_project(rel, exprs)
        }
        out_rel <- rel_distinct(rel)
      }

      out <- rel_to_df(out_rel)
      out <- dplyr_reconstruct(out, .data)
      # Early exit from `distinct.duckplyr_df()`: this block is evaluated in
      # the frame of this function, so the fallback below is skipped.
      return(out)
    }
  )

  # dplyr forward
  # Reached only when the relational block above did not return.
  distinct <- dplyr$distinct.data.frame
  out <- distinct(.data, ..., .keep_all = .keep_all)
  out
}

# Test helper: runs `distinct()` on a duckplyr data frame version of `.data`
# and returns the result without the "duckplyr_df" class.
#
# If `.data` cannot be converted with `as_duckplyr_df()`, the error message is
# passed to `testthat::skip()` instead of being raised.
duckplyr_distinct <- function(.data, ...) {
  # Turn a conversion failure into a test skip carrying the same message.
  skip_with_message <- function(cnd) {
    testthat::skip(conditionMessage(cnd))
  }

  try_fetch(
    .data <- as_duckplyr_df(.data),
    error = skip_with_message
  )

  result <- distinct(.data, ...)

  # Strip the duckplyr class so callers see the underlying data frame classes.
  class(result) <- setdiff(class(result), "duckplyr_df")
  result
}

Try the duckplyr package in your browser

Any scripts or data that you put into this service are public.

duckplyr documentation built on Sept. 12, 2024, 9:36 a.m.