inst/doc/dplyrXdf_intro.R

## ---- echo=FALSE, message=FALSE, results="hide"--------------------------
knitr::opts_chunk$set(collapse=TRUE, comment="#>")
options(dplyr.print_min=5L, dplyr.print_max=5L)

cc <- rxSetComputeContext("local")

## ------------------------------------------------------------------------
library(dplyrXdf)  # also loads dplyr
library(nycflights13)

# write the data as an xdf file
flightsXdf <- rxDataStep(flights, "flights.xdf", overwrite=TRUE)

## ------------------------------------------------------------------------
# select the rows
flights_rx1 <- rxDataStep(flightsXdf, outFile="flights_rx1.xdf",
                          rowSelection=month <= 6 & year == 2013,
                          overwrite=TRUE)

# variable transformations
flights_rx2 <- rxDataStep(flights_rx1, outFile="flights_rx2.xdf",
                          transforms=list(dist_km=distance*1.6093,
                                          delay=(arr_delay + dep_delay)/2),
                          overwrite=TRUE)

# convert carrier into a factor variable (or rxSummary will complain)
flights_rx3 <- rxFactors(flights_rx2, factorInfo="carrier",
                         outFile="flights_rx3.xdf", overwrite=TRUE)

# use rxSummary to get the summary table(s) (could also use rxCube twice)
flights_rx4 <- rxSummary(~delay:carrier + dist_km:carrier, data=flights_rx3,
                         summaryStats=c("mean", "sum"))

# extract the desired tables from the rxSummary output
flights_rx4_1 <- flights_rx4$categorical[[1]][c("carrier", "Means")]
names(flights_rx4_1)[2] <- "mean_delay"

flights_rx4_2 <- flights_rx4$categorical[[2]][c("carrier", "Sum")]
names(flights_rx4_2)[2] <- "sum_dist"

# merge the tables together
flights_rx5 <- merge(flights_rx4_1, flights_rx4_2, by="carrier", all=TRUE)

# sort the results
flights_rx5 <- flights_rx5[order(flights_rx5$mean_delay, decreasing=TRUE), ]

head(flights_rx5)

## ------------------------------------------------------------------------
flightsSmry <- flightsXdf %>%
    filter(month <= 6, year == 2013) %>%
    mutate(dist_km=distance*1.6093, delay=(arr_delay + dep_delay)/2) %>%
    group_by(carrier) %>%
    summarise(mean_delay=mean(delay), sum_dist=sum(dist_km)) %>%
    arrange(desc(mean_delay))

head(flightsSmry)

## ------------------------------------------------------------------------
rxLinMod(mean_delay ~ sum_dist + carrier, data=flightsSmry)

## ------------------------------------------------------------------------
airportsXdf <- rxDataStep(airports, "airports.xdf", overwrite=TRUE)
flightsJoin <- left_join(
    flightsXdf %>% select(year:day, hour, origin, dest, tailnum, carrier),
    airportsXdf,
    by=c("dest"="faa"))
head(flightsJoin)

## ----echo=FALSE, message=FALSE, results="hide"---------------------------
unlink(c("airports.xdf", "flights.xdf", "flights_rx1.xdf", "flights_rx2.xdf", "flights_rx3.xdf"))
clean_dplyrxdf_dir("native")
rxSetComputeContext(cc)
RevolutionAnalytics/dplyrXdf documentation built on June 3, 2019, 9:08 p.m.