R/subsetting.R

Defines functions `$.tbl_df` `$<-.tbl_df` `[[.tbl_df` `[[<-.tbl_df` `[.tbl_df` `[<-.tbl_df` vectbl_as_row_index fix_oob fix_oob_positive warn_oob fix_oob_negative warn_oob_negative vectbl_as_col_index tbl_subset2 tbl_subset_col tbl_subset_row tbl_subassign vectbl_as_new_row_index vectbl_as_new_col_index is_tight_sequence_at_end tbl_subassign_col coalesce2 coalesce2empty tbl_expand_to_nrow tbl_subassign_row check_scalar fast_nrow vectbl_recycle_cols vectbl_recycle_rows vectbl_restore

#' Subsetting tibbles
#'
#' @description
#' Accessing columns, rows, or cells via `$`, `[[`, or `[` is mostly similar to
#' [regular data frames][base::Extract.data.frame]. However, the
#' behavior is different for tibbles and data frames in some cases:
#' * `[` always returns a tibble by default, even if
#'   only one column is accessed.
#' * Partial matching of column names with `$` and `[[` is not supported, a
#'   warning is given and `NULL` is returned.
#' * Only scalars (vectors of length one) or vectors with the
#'   same length as the number of rows can be used for assignment.
#' * Rows outside of the tibble's boundaries cannot be accessed.
#' * When updating with `[[<-` and `[<-`, type changes of entire columns are
#'   supported, but updating a part of a column requires that the new value is
#'   coercible to the existing type.
#'   See [vec_slice()] for the underlying implementation.
#'
#' Unstable return type and implicit partial matching can lead to surprises and
#' bugs that are hard to catch. If you rely on code that requires the original
#' data frame behavior, coerce to a data frame via [as.data.frame()].
#'
#' @details
#' For better compatibility with older code written for regular data frames,
#' `[` supports a `drop` argument which defaults to `FALSE`.
#' New code should use `[[` to turn a column into a vector.
#'
#' @param value A value to store in a row, column, range or cell.
#'   Tibbles are stricter than data frames in what is accepted here.
#'
#' @name subsetting
#' @examples
#' df <- data.frame(a = 1:3, bc = 4:6)
#' tbl <- tibble(a = 1:3, bc = 4:6)
#'
#' # Subsetting single columns:
#' df[, "a"]
#' tbl[, "a"]
#' tbl[, "a", drop = TRUE]
#' as.data.frame(tbl)[, "a"]
#'
#' # Subsetting single rows with the drop argument:
#' df[1, , drop = TRUE]
#' tbl[1, , drop = TRUE]
#' as.list(tbl[1, ])
#'
#' # Accessing non-existent columns:
#' df$b
#' tbl$b
#'
#' df[["b", exact = FALSE]]
#' tbl[["b", exact = FALSE]]
#'
#' df$bd <- c("n", "e", "w")
#' tbl$bd <- c("n", "e", "w")
#' df$b
#' tbl$b
#'
#' df$b <- 7:9
#' tbl$b <- 7:9
#' df$b
#' tbl$b
#'
#' # Identical behavior:
#' tbl[1, ]
#' tbl[1, c("bc", "a")]
#' tbl[, c("bc", "a")]
#' tbl[c("bc", "a")]
#' tbl["a"]
#' tbl$a
#' tbl[["a"]]
NULL

#' @rdname subsetting
#' @inheritParams base::`[.data.frame`
#' @export
`$.tbl_df` <- function(x, name) {
  j <- match(as_string(name), names2(x))
  if (is.na(j)) {
    warn(paste0("Unknown or uninitialised column: `", name, "`."))
    NULL
  } else {
    .subset2(x, j)
  }
}


#' @rdname subsetting
#' @export
`$<-.tbl_df` <- function(x, name, value) {
  tbl_subassign(x, i = NULL, as_string(name), list(value))
}

#' @rdname subsetting
#' @param i,j Row and column indices. If `j` is omitted, `i` is used as column index.
#' @param ... Ignored.
#' @param exact Ignored, with a warning.
#' @export
`[[.tbl_df` <- function(x, i, j, ..., exact = TRUE) {
  if (!exact) {
    warn("`exact` ignored.")
  }

  n_dots <- dots_n(...)
  if (n_dots > 0) {
    warn("Extra arguments ignored.")
  }

  # Ignore exact as an argument for counting
  n_real_args <- nargs() - !missing(exact) - n_dots

  # Column subsetting if nargs() == 2L
  if (n_real_args <= 2L) {
    tbl_subset2(x, j = i)
  } else if (missing(j)) {
    error_assign_columns_non_missing_only()
  } else {
    i <- vec_as_location2(i, fast_nrow(x))

    x <- tbl_subset2(x, j = j)
    if (is.null(x)) return(x)
    vec_slice(x, i)
  }
}

#' @rdname subsetting
#' @inheritParams base::`[[<-.data.frame`
#' @export
`[[<-.tbl_df` <- function(x, i, j, value, ...) {
  if (missing(i)) {
    i <- NULL
  }

  if (missing(j)) {
    j <- NULL
  }

  if (is.null(j) && nargs() < 4) {
    j <- i
    i <- NULL
  }

  if (!is_null(i)) {
    i <- vec_as_location2(i, fast_nrow(x))
  }

  if (is_null(j)) {
    error_assign_columns_non_missing_only()
  }

  value <- list(value)

  j <- vectbl_as_new_col_index(j, x, value)
  check_scalar(j)

  # New columns are added to the end, provide index to avoid matching column
  # names again
  names(value) <- names(j)
  j <- coalesce2(unname(j), ncol(x) + 1L)

  tbl_subassign(x, i, j, value)
}


#' @rdname subsetting
#' @param drop Coerce to a vector if fetching one column via `tbl[, j]` .
#'   Default `FALSE`, ignored when accessing a column via `tbl[j]` .
#' @export
`[.tbl_df` <- function(x, i, j, drop = FALSE, ...) {
  if (missing(i)) {
    i <- NULL
  } else if (is.null(i)) {
    i <- integer()
  }

  if (missing(j)) {
    j <- NULL
  } else if (is.null(j)) {
    j <- integer()
  }

  # Ignore drop as an argument for counting
  n_real_args <- nargs() - !missing(drop)

  # Column or matrix subsetting if nargs() == 2L
  if (n_real_args <= 2L) {
    if (!missing(drop)) {
      warn("`drop` argument ignored for subsetting a tibble with `x[j]`, it has an effect only for `x[i, j]`.")
      drop <- FALSE
    }

    j <- i
    i <- NULL

    # Special case, returns a vector:
    if (is.matrix(j)) {
      return(tbl_subset_matrix(x, j))
    }
  }

  # From here on, i, j and drop contain correct values:
  xo <- tbl_subset_col(x, j = j)

  if (!is.null(i)) {
    xo <- tbl_subset_row(xo, i = i)
  }

  if (drop && length(xo) == 1L) {
    tbl_subset2(xo, 1L)
  } else {
    vectbl_restore(xo, x)
  }
}

#' @rdname subsetting
#' @inheritParams base::`[<-.data.frame`
#' @export
`[<-.tbl_df` <- function(x, i, j, value, ...) {
  if (missing(i)) {
    i <- NULL
  } else if (is.null(i)) {
    i <- integer()
  }

  if (missing(j)) {
    j <- NULL
  } else if (is.null(j)) {
    j <- integer()
  }

  if (is.null(j) && nargs() < 4) {
    j <- i
    i <- NULL

    # Special case:
    if (is.matrix(j)) {
      return(tbl_subassign_matrix(x, j, value))
    }
  }

  tbl_subassign(x, i, j, value)
}

vectbl_as_row_index <- function(i, x) {
  stopifnot(!is.null(i))

  nr <- fast_nrow(x)

  if (is.character(i)) {
    is_na_orig <- is.na(i)

    if (has_rownames(x)) {
      i <- match(i, rownames(x))
    } else {
      i <- string_to_indices(i)
      i <- fix_oob(i, nr, warn = FALSE)
    }

    oob <- which(is.na(i) & !is_na_orig)

    if (has_length(oob)) {
      warn_deprecated("Only valid row names can be used for indexing. Use `NA` as row index to obtain a row full of `NA` values.")
      i[oob] <- NA_integer_
    }
    i
  } else if (is.numeric(i)) {
    i <- fix_oob(i, nr)
    vec_as_location(i, nr)
  } else {
    vec_as_location(i, nr)
  }
}

fix_oob <- function(i, n, warn = TRUE) {
  if (all(i >= 0, na.rm = TRUE)) {
    fix_oob_positive(i, n, warn)
  } else if (all(i <= 0, na.rm = TRUE)) {
    fix_oob_negative(i, n, warn)
  } else {
    # Will throw error in vec_as_location()
    i
  }
}

fix_oob_positive <- function(i, n, warn = TRUE) {
  oob <- which(i > n)
  if (warn) {
    warn_oob(oob, n)
  }

  i[oob] <- NA_integer_
  i
}

warn_oob <- function(oob, n) {
  if (has_length(oob)) {
    warn_deprecated(paste0(
      "Row indexes must be between 0 and the number of rows (", n, "). ",
      "Use `NA` as row index to obtain a row full of `NA` values."
    ))
  }
}

fix_oob_negative <- function(i, n, warn = TRUE) {
  oob <- (i < -n)
  if (warn) {
    warn_oob_negative(which(oob), n)
  }

  i <- i[!oob]
  if (is_empty(i)) i <- seq_len(n)
  i
}

warn_oob_negative <- function(oob, n) {
  if (has_length(oob)) {
    warn_deprecated(paste0(
      "Negative row indexes must be between 0 and the number of rows negated (", -n, "). ",
      "Use `NA` as row index to obtain a row full of `NA` values."
    ))
  }
}

vectbl_as_col_index <- function(j, x) {
  stopifnot(!is.null(j))

  if (anyNA(j)) {
    abort(error_na_column_index(which(is.na(j))))
  }

  subclass_col_index_errors(
    vec_as_location(j, length(x), names(x), arg = "j")
  )
}

tbl_subset2 <- function(x, j) {
  if (is.matrix(j)) {
    signal_soft_deprecated("Calling `[[` with a matrix (recursive subsetting) is deprecated and will eventually be converted to an error.")
    return(as.matrix(x)[[j]])
  } else if (has_length(j, 2) && is.numeric(j)) {
    signal_soft_deprecated("Calling `[[` with a vector of length 2 (recursive subsetting) is deprecated and will eventually be converted to an error.")
    return(.subset2(x, j))
  } else if (is.character(j)) {
    check_scalar(j)
    # NA names should give an error
    if (!is.na(j)) {
      # Don't warn when accessing invalid column names
      return(.subset2(x, j))
    }
  }

  j <- vec_as_location2(j, length(x), names(x), arg = "j")
  .subset2(x, j)
}

tbl_subset_col <- function(x, j) {
  if (is_null(j)) return(x)
  j <- vectbl_as_col_index(j, x)
  xo <- .subset(x, j)
  xo <- set_repaired_names(xo, .name_repair = "minimal")
  set_tibble_class(xo, nrow = fast_nrow(x))
}

tbl_subset_row <- function(x, i) {
  if (is_null(i)) return(x)
  i <- vectbl_as_row_index(i, x)
  xo <- lapply(unclass(x), vec_slice, i = i)
  set_tibble_class(xo, nrow = length(i))
}

tbl_subassign <- function(x, i, j, value) {
  if (is_null(value) || is_atomic(value)) {
    value <- list(value)
  } else {
    value <- unclass(value)
  }
  stopifnot(is_bare_list(value))

  if (is.null(i)) {
    if (is.null(j)) {
      j <- seq_along(x)
    } else {
      j <- vectbl_as_new_col_index(j, x, value)
    }

    value <- vectbl_recycle_cols(value, length(j))
    xo <- tbl_subassign_col(x, j, value)
  } else {
    # Fill up rows first if necessary
    i <- vectbl_as_new_row_index(i, x)
    x <- tbl_expand_to_nrow(x, i)

    if (is.null(j)) {
      xo <- tbl_subassign_row(x, i, value)
    } else {
      # Optimization: match only once
      # (Invariant: x[[j]] is equivalent to x[[vec_as_location(j)]],
      # allowed by corollary that only existing columns can be updated)
      j <- vectbl_as_new_col_index(j, x, value)
      new_cols <- which(is.na(j))
      value <- vectbl_recycle_cols(value, length(j))

      # Fill up columns if necessary
      if (any(new_cols)) {
        x <- tbl_subassign_col(
          x, j[new_cols],
          map(value[new_cols], vec_slice, NA_integer_)
        )
        j <- match(names(j), names(x))
      }

      xj <- tbl_subset_col(x, j)
      xj <- tbl_subassign_row(xj, i, value)
      xo <- tbl_subassign_col(x, j, unclass(xj))
    }
  }

  vectbl_restore(xo, x)
}

vectbl_as_new_row_index <- function(i, x) {
  if (is_bare_numeric(i)) {
    if (anyDuplicated(i)) {
      abort(error_duplicate_row_subscript_for_assignment(i))
    }

    nr <- fast_nrow(x)

    new <- which(i > nr)
    i_new <- i[new]
    i[new] <- NA
    i <- vec_as_location(i, nr)

    if (!is_tight_sequence_at_end(i_new, nr)) {
      abort(error_new_rows_at_end_only(nr, i_new))
    }

    # Restore, caller knows how to deal
    i[new] <- i_new
    i
  } else if (is_logical(i)) {
    # Don't allow OOB logical
    vec_as_location(i, fast_nrow(x))
  } else {
    i <- vectbl_as_row_index(i, x)
    if (anyDuplicated(i, incomparables = NA)) {
      abort(error_duplicate_row_subscript_for_assignment(i))
    }
    i
  }
}

vectbl_as_new_col_index <- function(j, x, value) {
  # Creates a named index vector
  # Values: index,
  # Name: column name (for new columns)

  if (anyNA(j)) {
    abort(error_assign_columns_non_na_only())
  }

  if (is_bare_character(j)) {
    set_names(match(j, names(x)), j)
  } else if (is_bare_numeric(j)) {
    if (anyDuplicated(j)) {
      abort(error_duplicate_column_subscript_for_assignment(j))
    }

    new <- which(j > ncol(x))
    j_new <- j[new]
    j[new] <- NA
    j <- vec_as_location(j, ncol(x), arg = "j")

    if (!is_tight_sequence_at_end(j_new, ncol(x))) {
      abort(error_new_columns_at_end_only(ncol(x), j_new))
    }

    # FIXME: Recycled names are not repaired
    # FIXME: Hard-coded name repair
    names <- vectbl_recycle_cols(names2(value), length(j))
    names[new][names[new] == ""] <- paste0("...", j_new)

    set_names(j, names)
  } else {
    j <- vectbl_as_col_index(j, x)
    if (anyDuplicated(j)) {
      abort(error_duplicate_column_subscript_for_assignment(j))
    }
    j
  }
}

is_tight_sequence_at_end <- function(i_new, n) {
  all(sort(i_new) == seq2(n + 1, n + length(i_new)))
}

tbl_subassign_col <- function(x, j, value) {
  is_data <- !map_lgl(value, is_null)
  nrow <- fast_nrow(x)

  x <- unclass(x)

  # Create or update
  for (jj in which(is_data)) {
    ji <- coalesce2(j[[jj]], names(j)[[jj]])
    x[[ji]] <- vectbl_recycle_rows(value[[jj]], nrow, jj, coalesce2empty(names(j)[[jj]], names(x)[[ji]]))
  }

  # Remove
  j_remove <- j[!is_data & !is.na(j)]
  if (has_length(j_remove)) x <- x[-j_remove]

  # Restore
  set_tibble_class(x, nrow)
}

coalesce2 <- function(x, y) {
  if (is.na(x)) y else x
}

coalesce2empty <- function(x, y) {
  if (x == "") y else x
}

tbl_expand_to_nrow <- function(x, i) {
  nrow <- fast_nrow(x)

  new_nrow <- max(i, nrow)

  if (is.na(new_nrow)) {
    abort(error_assign_rows_non_na_only())
  }

  if (new_nrow != nrow) {
    # FIXME: vec_expand()?
    i_expand <- c(seq_len(nrow), rep(NA_integer_, new_nrow - nrow))
    x <- vec_slice(x, i_expand)
  }

  x
}

tbl_subassign_row <- function(x, i, value) {
  value <- vectbl_recycle_cols(value, ncol(x))

  nrow <- fast_nrow(x)

  x <- unclass(x)

  for (j in seq_along(x)) {
    xj <- x[[j]]
    vec_slice(xj, i) <- value[[j]]
    x[[j]] <- xj
  }

  set_tibble_class(x, nrow)
}

check_scalar <- function(j) {
  if (!has_length(j, 1)) {
    abort(error_need_scalar_column_index())
  }
}

fast_nrow <- function(x) {
  .row_names_info(x, 2L)
}

vectbl_recycle_cols <- function(x, n) {
  subclass_col_recycle_errors(
    vec_recycle(x, n)
  )
}

vectbl_recycle_rows <- function(x, n, j, name) {
  size <- vec_size(x)
  if (size == n) return(x)
  if (size == 1) return(vec_recycle(x, n))

  if (name == "") {
    name <- j
  }

  abort(error_inconsistent_cols(n, name, size, "Existing data"))
}

attrs_names_and_class <- c("names", "row.names", "class")
attrs_names_only <- c("names", "row.names")

vectbl_restore <- function(xo, x, forbidden = attrs_names_and_class) {
  n <- fast_nrow(xo)

  # FIXME: Use new_tibble()?
  attrs <- attributes(x)
  attrs <- attrs[!(names(attrs) %in% forbidden)]

  attributes(xo)[names(attrs)] <- attrs

  # Need to patch manually
  attr(xo, "row.names") <- .set_row_names(n)

  xo
}
krlmlr/tibble documentation built on Jan. 15, 2020, 7:56 a.m.