tests/testthat/test-mutate-windowed.R

context("Mutate - windowed")

test_that("mutate calls windowed versions of sql functions", {
  test_f <- function(tbl) {
    res <- tbl %>%
      group_by(g) %>%
      mutate(r = as.numeric(row_number(x))) %>%
      collect()
    expect_equal(res$r, c(1, 2, 1, 2))
  }

  df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
  tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
  tbls %>% lapply(test_f)
})

test_that("recycled aggregates generate window function", {
  test_f <- function(tbl) {
    res <- tbl %>%
      group_by(g) %>%
      mutate(r = x > mean(x)) %>%
      collect()
    expect_equal(res$r, c(FALSE, TRUE, FALSE, TRUE))
  }

  df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
  tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
  tbls %>% lapply(test_f)
})

test_that("cumulative aggregates generate window function", {
  test_f <- function(tbl) {
    res <- tbl %>%
      group_by(g) %>%
      arrange(x) %>%
      mutate(r = cumsum(x)) %>%
      collect()
    expect_equal(res$r, c(1, 3, 3, 7))
  }

  df <- data_frame(x = 1:4, g = rep(c(1, 2), each = 2))
  tbls <- test_load(df, ignore = "sqlite") # SQLite doesn't support window functions
  tbls %>% lapply(test_f)
})

test_that("desc is correctly handled by window functions", {
  df <- data.frame(x = 1:10, y = seq(1,10,by=1), g = rep(c(1, 2), each = 5))

  expect_equal(mutate(df, rank=min_rank(desc(x)) )$rank, 10:1 )
  expect_equal(mutate(group_by(df,g), rank=min_rank(desc(x)))$rank, rep(5:1,2) )

  expect_equal(mutate(df, rank=row_number(desc(x)) )$rank, 10:1 )
  expect_equal(mutate(group_by(df,g), rank=row_number(desc(x)))$rank, rep(5:1,2) )

})

test_that("row_number gives correct results",{
  tmp <- data.frame(id=rep(c(1,2),each=5),value=c(1,1,2,5,0,6,4,0,0,2))
  res <- group_by(tmp, id) %>% mutate(var=row_number(value))
  expect_equal(res$var, c(2,3,4,5,1,5,4,1,2,3))
})

test_that("row_number works with 0 arguments", {
  g <- group_by(mtcars, cyl)
  expect_equal( mutate( g, rn = row_number() ), mutate( g, rn = 1:n() ) )
})

test_that("cum(sum,min,max) works", {
  df <- data.frame(x = 1:10, y = seq(1,10,by=1), g = rep(c(1, 2), each = 5))

  res <- mutate( df,
    csumx = cumsum(x), csumy = cumsum(y),
    cminx = cummin(x), cminy = cummin(y),
    cmaxx = cummax(x), cmaxy = cummax(y)
  )
  expect_equal( res$csumx, cumsum(df$x) )
  expect_equal( res$csumy, cumsum(df$y) )
  expect_equal( res$cminx, cummin(df$x) )
  expect_equal( res$cminy, cummin(df$y) )
  expect_equal( res$cmaxx, cummax(df$x) )
  expect_equal( res$cmaxy, cummax(df$y) )

  res <- mutate( group_by(df,g) ,
    csumx = cumsum(x), csumy = cumsum(y),
    cminx = cummin(x), cminy = cummin(y),
    cmaxx = cummax(x), cmaxy = cummax(y)
    )
  expect_equal( res$csumx, c( cumsum(df$x[1:5]), cumsum(df$x[6:10]) ) )
  expect_equal( res$csumy, c( cumsum(df$y[1:5]), cumsum(df$y[6:10]) ) )
  expect_equal( res$cminx, c( cummin(df$x[1:5]), cummin(df$x[6:10]) ) )
  expect_equal( res$cminy, c( cummin(df$y[1:5]), cummin(df$y[6:10]) ) )
  expect_equal( res$cmaxx, c( cummax(df$x[1:5]), cummax(df$x[6:10]) ) )
  expect_equal( res$cmaxy, c( cummax(df$y[1:5]), cummax(df$y[6:10]) ) )

  df$x[3] <- NA
  df$y[4] <- NA
  res <- mutate( df,
    csumx = cumsum(x), csumy = cumsum(y),
    cminx = cummin(x), cminy = cummin(y),
    cmaxx = cummax(x), cmaxy = cummax(y)
  )
  expect_true( all(is.na(res$csumx[3:10])) )
  expect_true( all(is.na(res$csumy[4:10])) )

  expect_true( all(is.na(res$cminx[3:10])) )
  expect_true( all(is.na(res$cminy[4:10])) )

  expect_true( all(is.na(res$cmaxx[3:10])) )
  expect_true( all(is.na(res$cmaxy[4:10])) )

})

test_that( "lead and lag simple hybrid version gives correct results (#133)", {
  res <- group_by(mtcars, cyl) %>%
    mutate( disp_lag_2 = lag(disp, 2), disp_lead_2 = lead(disp, 2) ) %>%
    summarise(
        lag1  = all(is.na(head(disp_lag_2, 2))),
        lag2  = all(!is.na(tail(disp_lag_2, -2))),

        lead1 = all(is.na(tail(disp_lead_2, 2))),
        lead2 = all(!is.na(head(disp_lead_2, -2)))

    )
  expect_true( all(res$lag1) )
  expect_true( all(res$lag2) )

  expect_true( all(res$lead1) )
  expect_true( all(res$lead2) )
})

test_that("min_rank handles columns full of NaN (#726)", {
  test <- data.frame(
    Name = c("a", "b","c", "d", "e"),
    ID = c(1, 1, 1, 1, 1),
    expression = c(NaN, NaN, NaN, NaN, NaN)
  )
  data <- group_by(test, ID) %>% mutate(rank = min_rank(expression) )
  expect_true( all(is.na(data$rank)) )
})

test_that("rank functions deal correctly with NA (#774)", {
  data <- data_frame( x = c(1,2,NA,1,0,NA) )
  res <- data %>% mutate(
    min_rank = min_rank(x),
    percent_rank = percent_rank(x),
    dense_rank = dense_rank(x),
    cume_dist = cume_dist(x),
    ntile = ntile(x,2),
    row_number = row_number(x)
  )
  expect_true( all( is.na( res$min_rank[c(3,6)] ) ) )
  expect_true( all( is.na( res$dense_rank[c(3,6)] ) ) )
  expect_true( all( is.na( res$percent_rank[c(3,6)] ) ) )
  expect_true( all( is.na( res$cume_dist[c(3,6)] ) ) )
  expect_true( all( is.na( res$ntile[c(3,6)] ) ) )
  expect_true( all( is.na( res$row_number[c(3,6)] ) ) )

  expect_equal( res$percent_rank[ c(1,2,4,5) ], c(1/3, 1, 1/3, 0 ) )
  expect_equal( res$min_rank[ c(1,2,4,5) ], c(2L,4L,2L,1L) )
  expect_equal( res$dense_rank[ c(1,2,4,5) ], c(2L,3L,2L,1L) )
  expect_equal( res$cume_dist[ c(1,2,4,5) ], c(.75,1,.75,.25) )
  expect_equal( res$ntile[ c(1,2,4,5) ], c(1L,2L,2L,1L) )
  expect_equal( res$row_number[ c(1,2,4,5) ], c(2L,4L,3L,1L) )

  data <- data_frame(
    x = rep(c(1,2,NA,1,0,NA), 2),
    g = rep(c(1,2), each = 6)
  )
  res <- data %>%
    group_by(g) %>%
    mutate(
      min_rank = min_rank(x),
      percent_rank = percent_rank(x),
      dense_rank = dense_rank(x),
      cume_dist = cume_dist(x),
      ntile = ntile(x,2),
      row_number = row_number(x)
    )
  expect_true( all( is.na( res$min_rank[c(3,6,9,12)] ) ) )
  expect_true( all( is.na( res$dense_rank[c(3,6,9,12)] ) ) )
  expect_true( all( is.na( res$percent_rank[c(3,6,9,12)] ) ) )
  expect_true( all( is.na( res$cume_dist[c(3,6,9,12)] ) ) )
  expect_true( all( is.na( res$ntile[c(3,6,9,12)] ) ) )
  expect_true( all( is.na( res$row_number[c(3,6,9,12)] ) ) )

  expect_equal( res$percent_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(1/3, 1, 1/3, 0 ), 2) )
  expect_equal( res$min_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,4L,2L,1L), 2) )
  expect_equal( res$dense_rank[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,3L,2L,1L), 2) )
  expect_equal( res$cume_dist[ c(1,2,4,5,7,8,10,11) ], rep(c(.75,1,.75,.25), 2) )
  expect_equal( res$ntile[ c(1,2,4,5,7,8,10,11) ], rep(c(1L,2L,2L,1L), 2) )
  expect_equal( res$row_number[ c(1,2,4,5,7,8,10,11) ], rep(c(2L,4L,3L,1L), 2 ) )

})

test_that("lag and lead work on factors inside mutate (#955)", {
  test_factor <- factor(rep(c('A','B','C'), each = 3))
  exp_lag  <- test_factor != lag(test_factor)
  exp_lead <- test_factor != lead(test_factor)

  test_df <- tbl_df(data.frame(test = test_factor))
  res <- test_df %>% mutate(
    is_diff_lag  = (test != lag(test)),
    is_diff_lead = (test != lead(test))
    )
  expect_equal( exp_lag , res$is_diff_lag )
  expect_equal( exp_lead, res$is_diff_lead)

})

test_that("lag handles default argument in mutate (#915)", {
  blah <- data.frame(x1 = c(5,10,20,27,35,58,5,6), y = 8:1)
  blah <- mutate(blah,
    x2 = x1 - lag(x1, n=1, default=0),
    x3 = x1 - lead(x1, n=1, default=0),
    x4 = lag(x1, n=1L, order_by = y),
    x5 = lead(x1, n=1L, order_by = y)
  )
  expect_equal( blah$x2, blah$x1 - lag(blah$x1, n = 1, default = 0))
  expect_equal( blah$x3, blah$x1 - lead(blah$x1, n = 1, default = 0))
  expect_equal( blah$x4, lag( blah$x1, n=1L, order_by = blah$y ) )
  expect_equal( blah$x5, lead( blah$x1, n=1L, order_by = blah$y ) )
})

# FIXME: this should only fail if strict checking is on.
# test_that("window functions fail if db doesn't support windowing", {
#   df_sqlite <- temp_load(temp_srcs("sqlite"), df)$sql %>% group_by(g)
#   ok <- collect(df_sqlite %>% mutate(x > 4))
#   expect_equal(nrow(ok), 10)
#
#   expect_error(df_sqlite %>% mutate(x > mean(x)), "does not support")
#   expect_error(df_sqlite %>% mutate(r = row_number()), "does not support")
# })
sctyner/dplyr050 documentation built on May 17, 2019, 2:22 p.m.