issues/union_all_issue.md

union_all issue with SQLite. Submitted as dplyr issue 2270.

suppressPackageStartupMessages(library('dplyr'))
packageVersion('dplyr')
 #  [1] '0.7.0'
packageVersion('dbplyr')
 #  [1] '1.0.0'
my_db <- dplyr::src_sqlite(":memory:", create = TRUE)
dr <- dplyr::copy_to(my_db,
                     data.frame(x=c(1,2), y=c('a','b'),
                                stringsAsFactors = FALSE),
                     'dr',
                     overwrite=TRUE)
dr <- head(dr,1)
# dr <- compute(dr)
print(dr)
 #  # Source:   lazy query [?? x 2]
 #  # Database: sqlite 3.11.1 [:memory:]
 #        x     y
 #    <dbl> <chr>
 #  1     1     a
print(dplyr::union_all(dr,dr))
 #  Error: SQLite does not support set operations on LIMITs

Filed as RSQLite 215 and dplyr 2858.

rm(list=ls())
gc()
 #            used (Mb) gc trigger (Mb) max used (Mb)
 #  Ncells  620852 33.2    1168576 62.5   940480 50.3
 #  Vcells 1095786  8.4    2060183 15.8  1388772 10.6

Note that calling compute doesn't always fix the problem in my more complicated production example. Also, union does not seem to have the same issue as union_all. Nested function calls also seem to exacerbate the issue; perhaps a reference to a necessary structure goes out of scope, allowing a sub-table to be collected too soon? To trigger the full error in replyr, force the use of union_all in replyr_bind_rows and then try knitting basicChecksSpark200.Rmd. For the simple example above, however, materializing before the set operation does avoid the error, as sketched below.
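
A minimal sketch of that workaround, assuming the same dplyr 0.7.0 / dbplyr 1.0.0 setup as above: un-commenting the compute() step materializes the head() result into a concrete table, so neither operand of union_all() carries a LIMIT when the set-operation SQL is built.

suppressPackageStartupMessages(library('dplyr'))
my_db <- dplyr::src_sqlite(":memory:", create = TRUE)
dr <- dplyr::copy_to(my_db,
                     data.frame(x=c(1,2), y=c('a','b'),
                                stringsAsFactors = FALSE),
                     'dr',
                     overwrite=TRUE)
dr <- head(dr,1)
dr <- dplyr::compute(dr)  # materialize: dr now references a table, not a LIMITed lazy query
print(dplyr::union_all(dr,dr))  # should no longer error, as no operand has a LIMIT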

The following now works:

suppressPackageStartupMessages(library('dplyr'))
suppressPackageStartupMessages(library('sparklyr'))
packageVersion('dplyr')
 #  [1] '0.7.0'
packageVersion('dbplyr')
 #  [1] '1.0.0'
packageVersion('sparklyr')
 #  [1] '0.5.6'
my_db <- sparklyr::spark_connect(version='2.0.0', 
   master = "local")
class(my_db)
 #  [1] "spark_connection"       "spark_shell_connection" "DBIConnection"
my_db$spark_home
 #  [1] "/Users/johnmount/Library/Caches/spark/spark-2.0.0-bin-hadoop2.7"
da <- dplyr::copy_to(my_db,
                     data.frame(x=c(1,2),y=c('a','b'),
                                stringsAsFactors = FALSE),
                     'da',
                     overwrite=TRUE)
da <- head(da,1)
print(da)
 #  # Source:   lazy query [?? x 2]
 #  # Database: spark_connection
 #        x     y
 #    <dbl> <chr>
 #  1     1     a
db <- dplyr::copy_to(my_db,
                     data.frame(x=c(3,4),y=c('c','d'),
                                stringsAsFactors = FALSE),
                     'db',
                     overwrite=TRUE)
db <- head(db,1)
#da <- compute(da)
db <- compute(db)
print(db)
 #  # Source:   table<xdpgkdmlpt> [?? x 2]
 #  # Database: spark_connection
 #        x     y
 #    <dbl> <chr>
 #  1     3     c
res <- dplyr::union_all(da,db)
res <- dplyr::compute(res)
print(res)
 #  # Source:   table<omfsgngwxz> [?? x 2]
 #  # Database: spark_connection
 #        x     y
 #    <dbl> <chr>
 #  1     1     a
 #  2     3     c
print(da)
 #  # Source:   lazy query [?? x 2]
 #  # Database: spark_connection
 #        x     y
 #    <dbl> <chr>
 #  1     1     a
print(db)
 #  # Source:   table<xdpgkdmlpt> [?? x 2]
 #  # Database: spark_connection
 #        x     y
 #    <dbl> <chr>
 #  1     3     c
rm(list=ls())
gc()
 #            used (Mb) gc trigger (Mb) max used (Mb)
 #  Ncells  751959 40.2    1442291 77.1  1168576 62.5
 #  Vcells 1258380  9.7    2060183 15.8  1793341 13.7
version
 #                 _                           
 #  platform       x86_64-apple-darwin15.6.0   
 #  arch           x86_64                      
 #  os             darwin15.6.0                
 #  system         x86_64, darwin15.6.0        
 #  status                                     
 #  major          3                           
 #  minor          4.0                         
 #  year           2017                        
 #  month          04                          
 #  day            21                          
 #  svn rev        72570                       
 #  language       R                           
 #  version.string R version 3.4.0 (2017-04-21)
 #  nickname       You Stupid Darkness

