tests/testthat/test-joins.r

context("Joins")

# Univariate keys --------------------------------------------------------------

a <- data.frame(x = c(1, 1, 2, 3), y = 1:4)
b <- data.frame(x = c(1, 2, 2, 4), z = 1:4)

test_that("univariate inner join has all columns, repeated matching rows", {
  j <- inner_join(a, b, "x")

  expect_equal(names(j), c("x", "y", "z"))
  expect_equal(j$y, c(1, 2, 3, 3))
  expect_equal(j$z, c(1, 1, 2, 3))
})

test_that("univariate left join has all columns, all rows", {
  j1 <- left_join(a, b, "x")
  j2 <- left_join(b, a, "x")

  expect_equal(names(j1), c("x", "y", "z"))
  expect_equal(names(j2), c("x", "z", "y"))

  expect_equal(j1$z, c(1, 1, 2, 3, NA))
  expect_equal(j2$y, c(1, 2, 3, 3, NA))
})

test_that("univariate semi join has x columns, matching rows", {
  j1 <- semi_join(a, b, "x")
  j2 <- semi_join(b, a, "x")

  expect_equal(names(j1), c("x", "y"))
  expect_equal(names(j2), c("x", "z"))

  expect_equal(j1$y, 1:3)
  expect_equal(j2$z, 1:3)
})

test_that("univariate anti join has x columns, missing rows", {
  j1 <- anti_join(a, b, "x")
  j2 <- anti_join(b, a, "x")

  expect_equal(names(j1), c("x", "y"))
  expect_equal(names(j2), c("x", "z"))

  expect_equal(j1$y, 4)
  expect_equal(j2$z, 4)
})

test_that("univariate right join has all columns, all rows", {
  j1 <- right_join(a, b, "x")
  j2 <- right_join(b, a, "x")

  expect_equal(names(j1), c("x", "y", "z"))
  expect_equal(names(j2), c("x", "z", "y"))

  expect_equal(j1$x, c(1, 1, 2, 2, 4))
  expect_equal(j1$y, c(1, 2, 3, 3, NA))
  expect_equal(j1$z, c(1, 1, 2, 3, 4))

  expect_equal(j2$x, c(1, 1, 2, 2, 3))
  expect_equal(j2$y, c(1, 2, 3, 3, 4))
  expect_equal(j2$z, c(1, 1, 2, 3, NA))
})

# Bivariate keys ---------------------------------------------------------------

c <- data.frame(
  x = c(1, 1, 2, 3),
  y = c(1, 1, 2, 3),
  a = 1:4
)
d <- data.frame(
  x = c(1, 2, 2, 4),
  y = c(1, 2, 2, 4),
  b = 1:4
)

test_that("bivariate inner join has all columns, repeated matching rows", {
  j <- inner_join(c, d, c("x", "y"))

  expect_equal(names(j), c("x", "y", "a", "b"))
  expect_equal(j$a, c(1, 2, 3, 3))
  expect_equal(j$b, c(1, 1, 2, 3))
})

test_that("bivariate left join has all columns, all rows", {
  j1 <- left_join(c, d, c("x", "y"))
  j2 <- left_join(d, c, c("x", "y"))

  expect_equal(names(j1), c("x", "y", "a", "b"))
  expect_equal(names(j2), c("x", "y", "b", "a"))

  expect_equal(j1$b, c(1, 1, 2, 3, NA))
  expect_equal(j2$a, c(1, 2, 3, 3, NA))
})

test_that("bivariate semi join has x columns, matching rows", {
  j1 <- semi_join(c, d, c("x", "y"))
  j2 <- semi_join(d, c, c("x", "y"))

  expect_equal(names(j1), c("x", "y", "a"))
  expect_equal(names(j2), c("x", "y", "b"))

  expect_equal(j1$a, 1:3)
  expect_equal(j2$b, 1:3)
})

test_that("bivariate anti join has x columns, missing rows", {
  j1 <- anti_join(c, d, c("x", "y"))
  j2 <- anti_join(d, c, c("x", "y"))

  expect_equal(names(j1), c("x", "y", "a"))
  expect_equal(names(j2), c("x", "y", "b"))

  expect_equal(j1$a, 4)
  expect_equal(j2$b, 4)
})


# Duplicate column names --------------------------------------------------

e <- data.frame(x = c(1, 1, 2, 3), z = 1:4)
f <- data.frame(x = c(1, 2, 2, 4), z = 1:4)

test_that("univariate inner join has all columns, repeated matching rows", {
  j <- inner_join(e, f, "x")

  expect_equal(names(j), c("x", "z.x", "z.y"))
  expect_equal(j$z.x, c(1, 2, 3, 3))
  expect_equal(j$z.y, c(1, 1, 2, 3))
})

test_that("univariate left join has all columns, all rows", {
  j1 <- left_join(e, f, "x")
  j2 <- left_join(f, e, "x")

  expect_equal(names(j1), c("x", "z.x", "z.y"))
  expect_equal(names(j2), c("x", "z.x", "z.y"))

  expect_equal(j1$z.y, c(1, 1, 2, 3, NA))
  expect_equal(j2$z.y, c(1, 2, 3, 3, NA))
})

test_that("can control suffixes with suffix argument", {
  j1 <- inner_join(e, f, "x", suffix = c("1", "2"))
  j2 <- left_join(e, f, "x", suffix = c("1", "2"))
  j3 <- right_join(e, f, "x", suffix = c("1", "2"))
  j4 <- full_join(e, f, "x", suffix = c("1", "2"))

  expect_named(j1, c("x", "z1", "z2"))
  expect_named(j2, c("x", "z1", "z2"))
  expect_named(j3, c("x", "z1", "z2"))
  expect_named(j4, c("x", "z1", "z2"))
})

test_that("can handle empty string in suffix argument, left side (#2228, #2182, #2007)", {
  j1 <- inner_join(e, f, "x", suffix = c("", "2"))
  j2 <- left_join(e, f, "x", suffix = c("", "2"))
  j3 <- right_join(e, f, "x", suffix = c("", "2"))
  j4 <- full_join(e, f, "x", suffix = c("", "2"))

  expect_named(j1, c("x", "z", "z2"))
  expect_named(j2, c("x", "z", "z2"))
  expect_named(j3, c("x", "z", "z2"))
  expect_named(j4, c("x", "z", "z2"))
})

test_that("can handle empty string in suffix argument, right side (#2228, #2182, #2007)", {
  j1 <- inner_join(e, f, "x", suffix = c("1", ""))
  j2 <- left_join(e, f, "x", suffix = c("1", ""))
  j3 <- right_join(e, f, "x", suffix = c("1", ""))
  j4 <- full_join(e, f, "x", suffix = c("1", ""))

  expect_named(j1, c("x", "z1", "z"))
  expect_named(j2, c("x", "z1", "z"))
  expect_named(j3, c("x", "z1", "z"))
  expect_named(j4, c("x", "z1", "z"))
})

test_that("disallow empty string in both sides of suffix argument (#2228)", {
  expect_error(
    inner_join(e, f, "x", suffix = c("", "")),
    "`suffix` can't be empty string for both `x` and `y` suffixes",
    fixed = TRUE
  )
  expect_error(
    left_join(e, f, "x", suffix = c("", "")),
    "`suffix` can't be empty string for both `x` and `y` suffixes",
    fixed = TRUE
  )
  expect_error(
    right_join(e, f, "x", suffix = c("", "")),
    "`suffix` can't be empty string for both `x` and `y` suffixes",
    fixed = TRUE
  )
  expect_error(
    full_join(e, f, "x", suffix = c("", "")),
    "`suffix` can't be empty string for both `x` and `y` suffixes",
    fixed = TRUE
  )
})

test_that("disallow NA in any side of suffix argument", {
  expect_error(
    inner_join(e, f, "x", suffix = c(".x", NA)),
    "`suffix` can't be NA",
    fixed = TRUE
  )
  expect_error(
    left_join(e, f, "x", suffix = c(NA, ".y")),
    "`suffix` can't be NA",
    fixed = TRUE
  )
  expect_error(
    right_join(e, f, "x", suffix = c(NA_character_, NA)),
    "`suffix` can't be NA",
    fixed = TRUE
  )
  expect_error(
    full_join(e, f, "x", suffix = c("x", NA)),
    "`suffix` can't be NA",
    fixed = TRUE
  )
})

test_that("doesn't add suffix to by columns in x (#3307)", {
  j1 <- inner_join(e, f, by = c("x" = "z"))
  j2 <- left_join(e, f, by = c("x" = "z"))
  j3 <- right_join(e, f, by = c("x" = "z"))
  j4 <- full_join(e, f, by = c("x" = "z"))

  expect_named(j1, c("x", "z", "x.y"))
  expect_named(j2, c("x", "z", "x.y"))
  expect_named(j3, c("x", "z", "x.y"))
  expect_named(j4, c("x", "z", "x.y"))
})

g <- data.frame(A = 1, A.x = 2)
h <- data.frame(B = 3, A.x = 4, A = 5)

test_that("can handle 'by' columns with suffix (#3266)", {
  j1 <- inner_join(g, h, "A.x")
  j2 <- left_join(g, h, "A.x")
  j3 <- right_join(g, h, "A.x")
  j4 <- full_join(g, h, "A.x")

  expect_named(j1, c("A.x.x", "A.x", "B", "A.y"))
  expect_named(j2, c("A.x.x", "A.x", "B", "A.y"))
  expect_named(j3, c("A.x.x", "A.x", "B", "A.y"))
  expect_named(j4, c("A.x.x", "A.x", "B", "A.y"))
})

test_that("can handle 'by' columns with suffix, reverse (#3266)", {
  j1 <- inner_join(h, g, "A.x")
  j2 <- left_join(h, g, "A.x")
  j3 <- right_join(h, g, "A.x")
  j4 <- full_join(h, g, "A.x")

  expect_named(j1, c("B", "A.x", "A.x.x", "A.y"))
  expect_named(j2, c("B", "A.x", "A.x.x", "A.y"))
  expect_named(j3, c("B", "A.x", "A.x.x", "A.y"))
  expect_named(j4, c("B", "A.x", "A.x.x", "A.y"))
})

test_that("check suffix input", {
  expect_error(
    inner_join(e, f, "x", suffix = letters[1:3]),
    "`suffix` must be a character vector of length 2, not a character vector of length 3",
    fixed = TRUE
  )
  expect_error(
    inner_join(e, f, "x", suffix = letters[1]),
    "`suffix` must be a character vector of length 2, not a character vector of length 1",
    fixed = TRUE
  )
  expect_error(
    inner_join(e, f, "x", suffix = 1:2),
    "`suffix` must be a character vector of length 2, not an integer vector of length 2",
    fixed = TRUE
  )
})


# Misc --------------------------------------------------------------------

test_that("inner_join does not segfault on NA in factors (#306)", {
  a <- data.frame(x = c("p", "q", NA), y = c(1, 2, 3), stringsAsFactors = TRUE)
  b <- data.frame(x = c("p", "q", "r"), z = c(4, 5, 6), stringsAsFactors = TRUE)
  expect_warning(res <- inner_join(a, b, "x"), "joining factors with different levels")
  expect_equal(nrow(res), 2L)
})

test_that("joins don't reorder columns #328", {
  a <- data.frame(a = 1:3)
  b <- data.frame(a = 1:3, b = 1, c = 2, d = 3, e = 4, f = 5)
  res <- left_join(a, b, "a")
  expect_equal(names(res), names(b))
})

test_that("join handles type promotions #123", {
  df <- data.frame(
    V1 = c(rep("a", 5), rep("b", 5)),
    V2 = rep(c(1:5), 2),
    V3 = c(101:110),
    stringsAsFactors = FALSE
  )

  match <- data.frame(
    V1 = c("a", "b"),
    V2 = c(3.0, 4.0),
    stringsAsFactors = FALSE
  )
  res <- semi_join(df, match, c("V1", "V2"))
  expect_equal(res$V2, 3:4)
  expect_equal(res$V3, c(103L, 109L))
})

test_that("indices don't get mixed up when nrow(x) > nrow(y). #365", {
  a <- data.frame(V1 = c(0, 1, 2), V2 = c("a", "b", "c"), stringsAsFactors = FALSE)
  b <- data.frame(V1 = c(0, 1), V3 = c("n", "m"), stringsAsFactors = FALSE)
  res <- inner_join(a, b, by = "V1")
  expect_equal(res$V1, c(0, 1))
  expect_equal(res$V2, c("a", "b"))
  expect_equal(res$V3, c("n", "m"))
})

test_that("join functions error on column not found #371", {
  expect_error(
    left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = "x"),
    "`by` can't contain join column `x` which is missing from RHS",
    fixed = TRUE
  )
  expect_error(
    left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = "y"),
    "`by` can't contain join column `y` which is missing from LHS",
    fixed = TRUE
  )
  expect_error(
    left_join(data.frame(x = 1:5), data.frame(y = 1:5)),
    "`by` required, because the data sources have no common variables",
    fixed = TRUE
  )

  expect_error(
    left_join(data.frame(x = 1:5), data.frame(y = 1:5), by = 1:3),
    "`by` must be a (named) character vector, list, or NULL for natural joins (not recommended in production code), not an integer vector",
    fixed = TRUE
  )
})

test_that("inner_join is symmetric (even when joining on character & factor)", {
  foo <- tibble(id = factor(c("a", "b")), var1 = "foo")
  bar <- tibble(id = c("a", "b"), var2 = "bar")

  expect_warning(tmp1 <- inner_join(foo, bar, by = "id"), "joining factor and character")
  expect_warning(tmp2 <- inner_join(bar, foo, by = "id"), "joining character vector and factor")

  expect_is(tmp1$id, "character")
  expect_is(tmp2$id, "character")

  expect_equal(names(tmp1), c("id", "var1", "var2"))
  expect_equal(names(tmp2), c("id", "var2", "var1"))

  expect_equal(tmp1, tmp2)
})

test_that("inner_join is symmetric, even when type of join var is different (#450)", {
  foo <- tbl_df(data.frame(id = 1:10, var1 = "foo"))
  bar <- tbl_df(data.frame(id = as.numeric(rep(1:10, 5)), var2 = "bar"))

  tmp1 <- inner_join(foo, bar, by = "id")
  tmp2 <- inner_join(bar, foo, by = "id")

  expect_equal(names(tmp1), c("id", "var1", "var2"))
  expect_equal(names(tmp2), c("id", "var2", "var1"))

  expect_equal(tmp1, tmp2)
})

test_that("left_join by different variable names (#617)", {
  x <- tibble(x1 = c(1, 3, 2))
  y <- tibble(y1 = c(1, 2, 3), y2 = c("foo", "foo", "bar"))
  res <- left_join(x, y, by = c("x1" = "y1"))
  expect_equal(names(res), c("x1", "y2"))
  expect_equal(res$x1, c(1, 3, 2))
  expect_equal(res$y2, c("foo", "bar", "foo"))
})

test_that("joins support complex vectors", {
  a <- data.frame(x = c(1, 1, 2, 3) * 1i, y = 1:4)
  b <- data.frame(x = c(1, 2, 2, 4) * 1i, z = 1:4)
  j <- inner_join(a, b, "x")

  expect_equal(names(j), c("x", "y", "z"))
  expect_equal(j$y, c(1, 2, 3, 3))
  expect_equal(j$z, c(1, 1, 2, 3))
})

test_that("joins suffix variable names (#655)", {
  a <- data.frame(x = 1:10, y = 2:11)
  b <- data.frame(z = 5:14, x = 3:12) # x from this gets suffixed by .y
  res <- left_join(a, b, by = c("x" = "z"))
  expect_equal(names(res), c("x", "y", "x.y"))

  a <- data.frame(x = 1:10, z = 2:11)
  b <- data.frame(z = 5:14, x = 3:12) # x from this gets suffixed by .y
  res <- left_join(a, b, by = c("x" = "z"))
  expect_equal(names(res), c("x", "z", "x.y"))
})

test_that("right_join gets the column in the right order #96", {
  a <- data.frame(x = 1:10, y = 2:11)
  b <- data.frame(x = 5:14, z = 3:12)
  res <- right_join(a, b)
  expect_equal(names(res), c("x", "y", "z"))

  a <- data.frame(x = 1:10, y = 2:11)
  b <- data.frame(z = 5:14, a = 3:12)
  res <- right_join(a, b, by = c("x" = "z"))
  expect_equal(names(res), c("x", "y", "a"))
})

test_that("full_join #96", {
  a <- data.frame(x = 1:3, y = 2:4)
  b <- data.frame(x = 3:5, z = 3:5)
  res <- full_join(a, b, "x")
  expect_equal(res$x, 1:5)
  expect_equal(res$y[1:3], 2:4)
  expect_true(all(is.na(res$y[4:5])))

  expect_true(all(is.na(res$z[1:2])))
  expect_equal(res$z[3:5], 3:5)
})

test_that("JoinStringFactorVisitor and JoinFactorStringVisitor handle NA #688", {
  x <- data.frame(Greek = c("Alpha", "Beta", NA), numbers = 1:3)
  y <- data.frame(
    Greek = c("Alpha", "Beta", "Gamma"),
    Letters = c("C", "B", "C"),
    stringsAsFactors = F
  )

  expect_warning(
    res <- left_join(x, y, by = "Greek"),
    "Column `Greek` joining factor and character vector, coercing into character vector",
    fixed = TRUE
  )
  expect_true(is.na(res$Greek[3]))
  expect_true(is.na(res$Letters[3]))
  expect_equal(res$numbers, 1:3)

  expect_warning(
    res <- left_join(y, x, by = "Greek"),
    "Column `Greek` joining character vector and factor, coercing into character vector",
    fixed = TRUE
  )
  expect_equal(res$Greek, y$Greek)
  expect_equal(res$Letters, y$Letters)
  expect_equal(res$numbers[1:2], 1:2)
  expect_true(is.na(res$numbers[3]))
})


test_that("JoinFactorFactorVisitor_SameLevels preserve levels order (#675)", {
  input <- data.frame(g1 = factor(c("A", "B", "C"), levels = c("B", "A", "C")))
  output <- data.frame(
    g1 = factor(c("A", "B", "C"), levels = c("B", "A", "C")),
    g2 = factor(c("A", "B", "C"), levels = c("B", "A", "C"))
  )

  res <- inner_join(group_by(input, g1), group_by(output, g1))
  expect_equal(levels(res$g1), levels(input$g1))
  expect_equal(levels(res$g2), levels(output$g2))
})

test_that("inner_join does not reorder (#684)", {
  test <- tibble(Greek = c("Alpha", "Beta", "Gamma"), Letters = LETTERS[1:3])
  lookup <- tibble(Letters = c("C", "B", "C"))
  res <- inner_join(lookup, test)
  expect_equal(res$Letters, c("C", "B", "C"))
})

test_that("joins coerce factors with different levels to character (#684)", {
  d1 <- tibble(a = factor(c("a", "b", "c")))
  d2 <- tibble(a = factor(c("a", "e")))
  expect_warning(res <- inner_join(d1, d2))
  expect_is(res$a, "character")

  # different orders
  d2 <- d1
  attr(d2$a, "levels") <- c("c", "b", "a")
  expect_warning(res <- inner_join(d1, d2))
  expect_is(res$a, "character")
})

test_that("joins between factor and character coerces to character with a warning (#684)", {
  d1 <- tibble(a = factor(c("a", "b", "c")))
  d2 <- tibble(a = c("a", "e"))
  expect_warning(res <- inner_join(d1, d2))
  expect_is(res$a, "character")

  expect_warning(res <- inner_join(d2, d1))
  expect_is(res$a, "character")
})

test_that("group column names reflect renamed duplicate columns (#2330)", {
  d1 <- tibble(x = 1:5, y = 1:5) %>% group_by(x, y)
  d2 <- tibble(x = 1:5, y = 1:5)
  res <- inner_join(d1, d2, by = "x")
  expect_groups(d1, c("x", "y"))
  expect_groups(res, c("x", "y.x"))
})

test_that("group column names are null when joined data frames are not grouped (#2330)", {
  d1 <- tibble(x = 1:5, y = 1:5)
  d2 <- tibble(x = 1:5, y = 1:5)
  res <- inner_join(d1, d2, by = "x")
  expect_no_groups(res)
})

# Guessing variables in x and y ------------------------------------------------

test_that("unnamed vars are the same in both tables", {
  by1 <- common_by_from_vector(c("x", "y", "z"))
  expect_equal(by1$x, c("x", "y", "z"))
  expect_equal(by1$y, c("x", "y", "z"))

  by2 <- common_by_from_vector(c("x" = "a", "y", "z"))
  expect_equal(by2$x, c("x", "y", "z"))
  expect_equal(by2$y, c("a", "y", "z"))
})

test_that("join columns are not moved to the left (#802)", {
  df1 <- data.frame(x = 1, y = 1:5)
  df2 <- data.frame(y = 1:5, z = 2)

  out <- left_join(df1, df2)
  expect_equal(names(out), c("x", "y", "z"))
})

test_that("join can handle multiple encodings (#769)", {
  text <- c("\xC9lise", "Pierre", "Fran\xE7ois")
  Encoding(text) <- "latin1"
  x <- tibble(name = text, score = c(5, 7, 6))
  y <- tibble(name = text, attendance = c(8, 10, 9))
  res <- left_join(x, y, by = "name")
  expect_equal(nrow(res), 3L)
  expect_equal(res$name, x$name)

  x <- tibble(name = factor(text), score = c(5, 7, 6))
  y <- tibble(name = text, attendance = c(8, 10, 9))
  res <- suppressWarnings(left_join(x, y, by = "name"))
  expect_equal(nrow(res), 3L)
  expect_equal(res$name, y$name)

  x <- tibble(name = text, score = c(5, 7, 6))
  y <- tibble(name = factor(text), attendance = c(8, 10, 9))
  res <- suppressWarnings(left_join(x, y, by = "name"))
  expect_equal(nrow(res), 3L)
  expect_equal(res$name, x$name)

  x <- tibble(name = factor(text), score = c(5, 7, 6))
  y <- tibble(name = factor(text), attendance = c(8, 10, 9))
  res <- suppressWarnings(left_join(x, y, by = "name"))
  expect_equal(nrow(res), 3L)
  expect_equal(res$name, x$name)
})

test_that("join creates correctly named results (#855)", {
  x <- data.frame(q = c("a", "b", "c"), r = c("d", "e", "f"), s = c("1", "2", "3"))
  y <- data.frame(q = c("a", "b", "c"), r = c("d", "e", "f"), t = c("xxx", "xxx", "xxx"))
  res <- left_join(x, y, by = c("r", "q"))
  expect_equal(names(res), c("q", "r", "s", "t"))
  expect_equal(res$q, x$q)
  expect_equal(res$r, x$r)
})

test_that("inner join gives same result as merge by default (#1281)", {
  set.seed(75)
  x <- data.frame(
    cat1 = sample(c("A", "B", NA), 5, 1),
    cat2 = sample(c(1, 2, NA), 5, 1), v = rpois(5, 3),
    stringsAsFactors = FALSE
  )
  y <- data.frame(
    cat1 = sample(c("A", "B", NA), 5, 1),
    cat2 = sample(c(1, 2, NA), 5, 1), v = rpois(5, 3),
    stringsAsFactors = FALSE
  )
  ij <- inner_join(x, y, by = c("cat1", "cat2"))
  me <- merge(x, y, by = c("cat1", "cat2"))
  expect_true(equal_data_frame(ij, me))
})

test_that("join handles matrices #1230", {
  df1 <- tibble(x = 1:10, text = letters[1:10])
  df2 <- tibble(x = 1:5, text = "")
  df2$text <- matrix(LETTERS[1:10], nrow = 5)

  res <- left_join(df1, df2, by = c("x" = "x")) %>% filter(x > 5)
  text.y <- res$text.y
  expect_true(is.matrix(text.y))
  expect_equal(dim(text.y), c(5, 2))
  expect_true(all(is.na(text.y)))
})

test_that("ordering of strings is not confused by R's collate order (#1315)", {
  a <- data.frame(character = c("\u0663"), set = c("arabic_the_language"), stringsAsFactors = F)
  b <- data.frame(character = c("3"), set = c("arabic_the_numeral_set"), stringsAsFactors = F)
  res <- b %>% inner_join(a, by = c("character"))
  expect_equal(nrow(res), 0L)
  res <- a %>% inner_join(b, by = c("character"))
  expect_equal(nrow(res), 0L)
})

test_that("joins handle tzone differences (#819)", {
  date1 <- structure(-1735660800, tzone = "America/Chicago", class = c("POSIXct", "POSIXt"))
  date2 <- structure(-1735660800, tzone = "UTC", class = c("POSIXct", "POSIXt"))

  df1 <- data.frame(date = date1)
  df2 <- data.frame(date = date2)

  expect_equal(attr(left_join(df1, df1)$date, "tzone"), "America/Chicago")
})

test_that("joins matches NA in character vector by default (#892, #2033)", {
  x <- data.frame(
    id = c(NA_character_, NA_character_),
    stringsAsFactors = F
  )

  y <- expand.grid(
    id = c(NA_character_, NA_character_),
    LETTER = LETTERS[1:2],
    stringsAsFactors = F
  )

  res <- left_join(x, y, by = "id")
  expect_true(all(is.na(res$id)))
  expect_equal(res$LETTER, rep(rep(c("A", "B"), each = 2), 2))
})

test_that("joins avoid name repetition (#1460)", {
  d1 <- data.frame(id = 1:5, foo = rnorm(5))
  d2 <- data.frame(id = 1:5, foo = rnorm(5))
  d3 <- data.frame(id = 1:5, foo = rnorm(5))
  d <- d1 %>%
    left_join(d1, by = "id") %>%
    left_join(d2, by = "id") %>%
    left_join(d3, by = "id")
  expect_equal(names(d), c("id", "foo.x", "foo.y", "foo.x.x", "foo.y.y"))
})

test_that("join functions are protected against empty by (#1496)", {
  x <- data.frame()
  y <- data.frame(a = 1)
  expect_error(
    left_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
  expect_error(
    right_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
  expect_error(
    semi_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
  expect_error(
    full_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
  expect_error(
    anti_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
  expect_error(
    inner_join(x, y, by = names(x)),
    "`by` must specify variables to join by",
    fixed = TRUE
  )
})

test_that("joins takes care of duplicates in by (#1192)", {
  data2 <- tibble(a = 1:3)
  data1 <- tibble(a = 1:3, c = 3:5)

  res1 <- left_join(data1, data2, by = c("a", "a"))
  res2 <- left_join(data1, data2, by = c("a" = "a"))
  expect_equal(res1, res2)
})

# Joined columns result in correct type ----------------------------------------

test_that("result of joining POSIXct is POSIXct (#1578)", {
  data1 <- tibble(
    t = seq(as.POSIXct("2015-12-01", tz = "UTC"), length.out = 2, by = "days"),
    x = 1:2
  )
  data2 <- inner_join(data1, data1, by = "t")
  res1 <- class(data2$t)
  expected <- c("POSIXct", "POSIXt")
  expect_identical(res1, expected)
})

test_that("joins allows extra attributes if they are identical (#1636)", {
  tbl_left <- tibble(
    i = rep(c(1, 2, 3), each = 2),
    x1 = letters[1:6]
  )
  tbl_right <- tibble(
    i = c(1, 2, 3),
    x2 = letters[1:3]
  )

  attr(tbl_left$i, "label") <- "iterator"
  attr(tbl_right$i, "label") <- "iterator"

  res <- left_join(tbl_left, tbl_right, by = "i")
  expect_equal(attr(res$i, "label"), "iterator")

  attr(tbl_left$i, "foo") <- "bar"
  attributes(tbl_right$i) <- NULL
  attr(tbl_right$i, "foo") <- "bar"
  attr(tbl_right$i, "label") <- "iterator"

  res <- left_join(tbl_left, tbl_right, by = "i")
  expect_equal(attr(res$i, "label"), "iterator")
  expect_equal(attr(res$i, "foo"), "bar")
})

test_that("joins work with factors of different levels (#1712)", {
  d1 <- iris[, c("Species", "Sepal.Length")]
  d2 <- iris[, c("Species", "Sepal.Width")]
  d2$Species <- factor(as.character(d2$Species), levels = rev(levels(d1$Species)))
  expect_warning(res1 <- left_join(d1, d2, by = "Species"))

  d1$Species <- as.character(d1$Species)
  d2$Species <- as.character(d2$Species)
  res2 <- left_join(d1, d2, by = "Species")
  expect_equal(res1, res2)
})

test_that("anti and semi joins give correct result when by variable is a factor (#1571)", {
  big <- data.frame(letter = rep(c("a", "b"), each = 2), number = 1:2)
  small <- data.frame(letter = "b")
  expect_warning(
    aj_result <- anti_join(big, small, by = "letter"),
    "Column `letter` joining factors with different levels, coercing to character vector",
    fixed = TRUE
  )
  expect_equal(aj_result$number, 1:2)
  expect_equal(aj_result$letter, factor(c("a", "a"), levels = c("a", "b")))

  expect_warning(
    sj_result <- semi_join(big, small, by = "letter"),
    "Column `letter` joining factors with different levels, coercing to character vector",
    fixed = TRUE
  )
  expect_equal(sj_result$number, 1:2)
  expect_equal(sj_result$letter, factor(c("b", "b"), levels = c("a", "b")))
})

test_that("inner join not crashing (#1559)", {
  df3 <- tibble(
    id = c(102, 102, 102, 121),
    name = c("qwer", "qwer", "qwer", "asdf"),
    k = factor(c("one", "two", "total", "one"), levels = c("one", "two", "total")),
    total = factor(c("tot", "tot", "tot", "tot"), levels = c("tot", "plan", "fact")),
    v = c(NA_real_, NA_real_, NA_real_, NA_real_),
    btm = c(25654.957609, 29375.7547216667, 55030.7123306667, 10469.3523273333),
    top = c(22238.368946, 30341.516924, 52579.88587, 9541.893144)
  )
  df4 <- tibble(
    id = c(102, 102, 102, 121),
    name = c("qwer", "qwer", "qwer", "asdf"),
    k = factor(c("one", "two", "total", "one"), levels = c("one", "two", "total")),
    type = factor(c("fact", "fact", "fact", "fact"), levels = c("tot", "plan", "fact")),
    perc = c(0.15363485835208, -0.0318297270618471, 0.0466114830816894, 0.0971986553754823)
  )
  # all we want here is to test that this does not crash
  expect_message(res <- replicate(100, df3 %>% inner_join(df4)))
  for (i in 2:100) expect_equal(res[, 1], res[, i])
})


# Encoding ----------------------------------------------------------------

test_that("join handles mix of encodings in data (#1885, #2118, #2271)", {
  with_non_utf8_encoding({
    special <- get_native_lang_string()

    for (factor1 in c(FALSE, TRUE)) {
      for (factor2 in c(FALSE, TRUE)) {
        for (encoder1 in c(enc2native, enc2utf8)) {
          for (encoder2 in c(enc2native, enc2utf8)) {
            df1 <- data.frame(x = encoder1(special), y = 1, stringsAsFactors = factor1)
            df1 <- tbl_df(df1)
            df2 <- data.frame(x = encoder2(special), z = 2, stringsAsFactors = factor2)
            df2 <- tbl_df(df2)
            df <- data.frame(x = special, y = 1, z = 2, stringsAsFactors = factor1 && factor2)
            df <- tbl_df(df)

            info <- paste(
              factor1,
              factor2,
              Encoding(as.character(df1$x)),
              Encoding(as.character(df2$x))
            )

            if (factor1 != factor2) {
              warning_msg <- "coercing"
            } else {
              warning_msg <- NA
            }

            expect_warning_msg <- function(code, msg = warning_msg) {
              expect_warning(
                code, msg,
                info = paste(deparse(substitute(code)[[2]][[1]]), info)
              )
            }

            expect_equal_df <- function(code, df_ = df) {
              code <- substitute(code)
              eval(bquote(
                expect_equal(
                  .(code), df_,
                  info = paste(deparse(code[[1]]), info)
                )
              ))
            }

            expect_warning_msg(expect_equal_df(inner_join(df1, df2, by = "x")))
            expect_warning_msg(expect_equal_df(left_join(df1, df2, by = "x")))
            expect_warning_msg(expect_equal_df(right_join(df1, df2, by = "x")))
            expect_warning_msg(expect_equal_df(full_join(df1, df2, by = "x")))
            expect_warning_msg(
              expect_equal_df(
                semi_join(df1, df2, by = "x"),
                data.frame(x = special, y = 1, stringsAsFactors = factor1)
              )
            )
            expect_warning_msg(
              expect_equal_df(
                anti_join(df1, df2, by = "x"),
                data.frame(x = special, y = 1, stringsAsFactors = factor1)[0, ]
              )
            )
          }
        }
      }
    }
  })
})

test_that("left_join handles mix of encodings in column names (#1571)", {
  with_non_utf8_encoding({
    special <- get_native_lang_string()

    df1 <- tibble(x = 1:6, foo = 1:6)
    names(df1)[1] <- special

    df2 <- tibble(x = 1:6, baz = 1:6)
    names(df2)[1] <- enc2native(special)

    expect_message(res <- left_join(df1, df2), special, fixed = TRUE)
    expect_equal(names(res), c(special, "foo", "baz"))
    expect_equal(res$foo, 1:6)
    expect_equal(res$baz, 1:6)
    expect_equal(res[[special]], 1:6)
  })
})

# Misc --------------------------------------------------------------------

test_that("NAs match in joins only with na_matches = 'na' (#2033)", {
  df1 <- tibble(a = NA)
  df2 <- tibble(a = NA, b = 1:3)
  for (na_matches in c("na", "never")) {
    accept_na_match <- (na_matches == "na")
    expect_equal(inner_join(df1, df2, na_matches = na_matches) %>% nrow(), 0 + 3 * accept_na_match)
    expect_equal(left_join(df1, df2, na_matches = na_matches) %>% nrow(), 1 + 2 * accept_na_match)
    expect_equal(right_join(df2, df1, na_matches = na_matches) %>% nrow(), 1 + 2 * accept_na_match)
    expect_equal(full_join(df1, df2, na_matches = na_matches) %>% nrow(), 4 - accept_na_match)
    expect_equal(anti_join(df1, df2, na_matches = na_matches) %>% nrow(), 1 - accept_na_match)
    expect_equal(semi_join(df1, df2, na_matches = na_matches) %>% nrow(), 0 + accept_na_match)
  }
})

test_that("joins regroups (#1597, #3566)", {
  df1 <- tibble(a = 1:3) %>% group_by(a)
  df2 <- tibble(a = rep(1:4, 2)) %>% group_by(a)

  expect_grouped <- function(df) {
    expect_true(is_grouped_df(df))
  }

  expect_grouped(inner_join(df1, df2))
  expect_grouped(left_join(df1, df2))
  expect_grouped(right_join(df2, df1))
  expect_grouped(full_join(df1, df2))
  expect_grouped(anti_join(df1, df2))
  expect_grouped(semi_join(df1, df2))
})


test_that("join accepts tz attributes (#2643)", {
  # It's the same time:
  df1 <- tibble(a = as.POSIXct("2009-01-01 10:00:00", tz = "Europe/London"))
  df2 <- tibble(a = as.POSIXct("2009-01-01 11:00:00", tz = "Europe/Paris"))
  result <- inner_join(df1, df2, by = "a")
  expect_equal(nrow(result), 1)
})

test_that("join takes LHS with warning if attributes inconsistent", {
  df1 <- tibble(a = 1:2, b = 2:1)
  df2 <- tibble(
    a = structure(1:2, foo = "bar"),
    c = 2:1
  )

  expect_warning(
    out1 <- left_join(df1, df2, by = "a"),
    "Column `a` has different attributes on LHS and RHS of join"
  )
  expect_warning(out2 <- left_join(df2, df1, by = "a"))
  expect_warning(
    out3 <- left_join(df1, df2, by = c("b" = "a")),
    "Column `b`/`a` has different attributes on LHS and RHS of join"
  )

  expect_equal(attr(out1$a, "foo"), NULL)
  expect_equal(attr(out2$a, "foo"), "bar")
})

test_that("common_by() message", {
  df <- tibble(!!!set_names(letters, letters))

  expect_message(
    left_join(df, df %>% select(1)),
    'Joining, by = "a"',
    fixed = TRUE
  )

  expect_message(
    left_join(df, df %>% select(1:3)),
    'Joining, by = c("a", "b", "c")',
    fixed = TRUE
  )

  expect_message(
    left_join(df, df),
    paste0("Joining, by = c(", paste0('"', letters, '"', collapse = ", "), ")"),
    fixed = TRUE
  )
})

test_that("semi- and anti-joins preserve order (#2964)", {
  expect_identical(
    tibble(a = 3:1) %>% semi_join(tibble(a = 1:3)),
    tibble(a = 3:1)
  )
  expect_identical(
    tibble(a = 3:1) %>% anti_join(tibble(a = 4:6)),
    tibble(a = 3:1)
  )
})

test_that("join handles raw vectors", {
  df1 <- tibble(r = as.raw(1:4), x = 1:4)
  df2 <- tibble(r = as.raw(3:6), y = 3:6)

  expect_identical(
    left_join(df1, df2, by = "r"),
    tibble(r = as.raw(1:4), x = 1:4, y = c(NA, NA, 3:4))
  )

  expect_identical(
    right_join(df1, df2, by = "r"),
    tibble(r = as.raw(3:6), x = c(3:4, NA, NA), y = c(3:6))
  )

  expect_identical(
    full_join(df1, df2, by = "r"),
    tibble(r = as.raw(1:6), x = c(1:4, NA, NA), y = c(NA, NA, 3:6))
  )

  expect_identical(
    inner_join(df1, df2, by = "r"),
    tibble(r = as.raw(3:4), x = c(3:4), y = c(3:4))
  )
})

test_that("nest_join works (#3570)",{
  df1 <- tibble(x = c(1, 2), y = c(2, 3))
  df2 <- tibble(x = c(1, 1), z = c(2, 3))
  res <- nest_join(df1, df2, by = "x")
  expect_equal(names(res), c(names(df1), "df2"))
  expect_identical(res$df2[[1]], select(df2, z))
  expect_identical(res$df2[[2]], tibble(z = double()))
})

test_that("nest_join handles multiple matches in x (#3642)", {
  df1 <- tibble(x = c(1, 1))
  df2 <- tibble(x = 1, y = 1:2)

  tbls <- df1 %>%
    nest_join(df2) %>%
    pull()

  expect_identical(tbls[[1]], tbls[[2]])
})

test_that("joins reject data frames with duplicate columns (#3243)", {
  df1 <- data.frame(x1 = 1:3, x2 = 1:3, y = 1:3)
  names(df1)[1:2] <- "x"
  df2 <- data.frame(x = 2:4, y = 2:4)

  expect_error(
    left_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  expect_error(
    left_join(df2, df1, by = c("x", "y")),
    "Column `x` must have a unique name",
    fixed = TRUE
  )

  expect_error(
    right_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  expect_error(
    right_join(df2, df1, by = c("x", "y")),
    "Column `x` must have a unique name",
    fixed = TRUE
  )

  expect_error(
    inner_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  expect_error(
    inner_join(df2, df1, by = c("x", "y")),
    "Column `x` must have a unique name",
    fixed = TRUE
  )

  expect_error(
    full_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  expect_error(
    full_join(df2, df1, by = c("x", "y")),
    "Column `x` must have a unique name",
    fixed = TRUE
  )

  expect_error(
    semi_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  # FIXME: Compatibility, should throw an error eventually
  expect_warning(
    expect_equal(
      semi_join(df2, df1, by = c("x", "y")),
      data.frame(x = 2:3, y = 2:3)
    ),
    "Column `x` must have a unique name",
    fixed = TRUE
  )

  expect_error(
    anti_join(df1, df2, by = c("x", "y")),
    "name",
    fixed = TRUE
  )

  # FIXME: Compatibility, should throw an error eventually
  expect_warning(
    expect_equal(
      anti_join(df2, df1, by = c("x", "y")),
      data.frame(x = 4L, y = 4L)
    ),
    "Column `x` must have a unique name",
    fixed = TRUE
  )
})

test_that("joins reject data frames with NA columns (#3417)", {
  df_a <- tibble(B = c("a", "b", "c"), AA = 1:3)
  df_b <- tibble(AA = 2:4, C = c("aa", "bb", "cc"))

  df_aa <- df_a
  attr(df_aa, "names") <- c(NA, "AA")
  df_ba <- df_b
  attr(df_ba, "names") <- c("AA", NA)

  expect_error(
    left_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    left_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    left_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )

  expect_error(
    right_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    right_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    right_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )

  expect_error(
    inner_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    inner_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    inner_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )

  expect_error(
    full_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    full_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_error(
    full_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )

  expect_warning(
    semi_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_warning(
    semi_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_warning(
    semi_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )

  expect_warning(
    anti_join(df_aa, df_b),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_warning(
    anti_join(df_aa, df_ba),
    "Column `1` cannot have NA as name",
    fixed = TRUE
  )
  expect_warning(
    anti_join(df_a, df_ba),
    "Column `2` cannot have NA as name",
    fixed = TRUE
  )
})

Try the dplyr package in your browser

Any scripts or data that you put into this service are public.

dplyr documentation built on July 4, 2019, 5:08 p.m.