In WinVector/replyr: Patches to Use 'dplyr' on Remote Data Sources

Check complex join results.

suppressPackageStartupMessages(library('dplyr'))

# Build ten small three-row tables and left-join them one after another on
# the key column `k`.
#
# Arguments:
#   prefix        string placed in the middle of each generated table name
#                 (tables are named t_<prefix>_1 ... t_<prefix>_10).
#   sc            data source handed to copy_to(); when NULL the tables stay
#                 local data.frames.
#   eagerCompute  if TRUE, compute() is called after every join step.
#   uniqueColumns if TRUE, the value column of each table is renamed to
#                 y_<table name>; otherwise every table's value column is `v`.
#
# Returns the result of compute() applied to the final joined table.
runJoinExperiment <- function(prefix, sc, eagerCompute, uniqueColumns) {
  # `tableNames` rather than `names`, to avoid shadowing base::names().
  tableNames <- paste('t', prefix, 1:10, sep= '_')
  joined <- NULL
  for(ni in tableNames) {
    di <- data.frame(k= 1:3, 
                     v= paste(ni, 1:3, sep= '_'))
    if(uniqueColumns) {
      colnames(di)[[2]] <- paste('y', ni, sep= '_')
    }
    if(!is.null(sc)) {
      ti <- copy_to(sc, di, ni)
    } else {
      ti <- di
    }
    # The first table seeds the result; every later table is joined onto it.
    if(is.null(joined)) {
      joined <- ti
    } else {
      joined <- left_join(joined, ti, by= 'k')
      if(eagerCompute) {
        joined <- compute(joined)
      }
    }
  }
  compute(joined)
}

# local data.frames only (no remote source): behaves as expected
runJoinExperiment(prefix = 'inmem', sc = NULL,
                  eagerCompute = FALSE, uniqueColumns = FALSE)

Using RSQLite through dplyr loses columns. This has been submitted as RSQLite issue 214 and dplyr issue 2823.

# in-memory SQLite database used as the remote source
sc <- src_sqlite(path = ":memory:", create = TRUE)

# lazy joins: raises an error, which is printed instead of stopping the script
tryCatch(
  expr = runJoinExperiment(prefix = 'sqlitea', sc = sc,
                           eagerCompute = FALSE, uniqueColumns = FALSE),
  error = function(err) {
    print(err)
  }
)

# eager compute after each join: runs, but the result is missing columns
runJoinExperiment(prefix = 'sqliteb', sc = sc,
                  eagerCompute = TRUE, uniqueColumns = FALSE)

Using Spark through sparklyr/dplyr doesn't disambiguate columns as the local process does.

# local Spark connection used as the remote source
sc <- sparklyr::spark_connect(master = "local",
                              version = '2.0.2')

# lazy joins with a shared value-column name: raises an error (printed)
tryCatch(
  expr = runJoinExperiment(prefix = 'sparka', sc = sc,
                           eagerCompute = FALSE, uniqueColumns = FALSE),
  error = function(err) {
    print(err)
  }
)

# eager compute after each join: also raises an error (printed)
tryCatch(
  expr = runJoinExperiment(prefix = 'sparkb', sc = sc,
                           eagerCompute = TRUE, uniqueColumns = FALSE),
  error = function(err) {
    print(err)
  }
)

We can try this again with unambiguous columns, which works. I am assuming that this is dplyr issue 2773 and sparklyr issue 677.

# NOTE(review): this call was annotated "throws", while the accompanying text
# says the unique-column variant works -- confirm which is current.
# It is guarded like the other calls annotated as throwing, so that an error
# is printed rather than aborting the script before spark_disconnect() runs.
tryCatch(
  runJoinExperiment('spark2a', sc, FALSE, TRUE),
  error = function(e) print(e)
)

runJoinExperiment('spark2b', sc, TRUE, TRUE)

# release the Spark connection
sparklyr::spark_disconnect(sc)

# Report the package and R versions used for this run.
packageVersion("dplyr")
packageVersion("sparklyr")
# dbplyr and RSQLite are reported only when they are installed.
invisible(lapply(c("dbplyr", "RSQLite"), function(pkg) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    print(packageVersion(pkg))
  }
}))
R.Version()$version.string