#' Create estimation data
#'
#' Create the dataset used for the estimation.
#'
#' @param sc Spark connection
#'
#' @export
create_estimation_data <- function(sc){
sparklyr::sdf_sql(sc, "SELECT rides.dba_category, rides.pickup_zone, rides.pickup_date, rides.pickup_hour, rides.n, bike_zones.n_bikestation, CASE WHEN bike_zones.n_bikestation > 0 THEN 1 ELSE 0 END AS bikezone
FROM rides
LEFT JOIN bike_zones
ON rides.pickup_zone = bike_zones.zone_id") %>%
dplyr::compute("rides_taken")
sparklyr::sdf_sql(sc, "SELECT DISTINCT dba_category FROM rides") %>% dplyr::compute("firms")
sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_zone FROM rides") %>% dplyr::compute("zones")
sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_date FROM rides") %>% dplyr::compute("dates")
sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_hour FROM rides") %>% dplyr::compute("hours")
sparklyr::sdf_sql(sc, "SELECT * FROM firms
CROSS JOIN zones
CROSS JOIN dates
CROSS JOIN hours") %>% dplyr::compute("grid")
sparklyr::sdf_sql(sc, "SELECT (CASE WHEN g.dba_category = 'lyft' THEN 1 ELSE 0 END) AS lyft, g.pickup_date, g.pickup_hour, g.pickup_zone, r.bikezone, r.n_bikestation, r.n
FROM grid AS g
LEFT JOIN rides_taken AS r
ON g.dba_category = r.dba_category AND g.pickup_zone = r.pickup_zone AND g.pickup_date = r.pickup_date AND g.pickup_hour = r.pickup_hour") %>%
dplyr::compute("paperdata")
sparklyr::sdf_sql(sc, "DROP TABLE grid")
sparklyr::sdf_sql(sc, "DROP TABLE rides_taken")
sparklyr::sdf_sql(sc, "DROP TABLE firms")
sparklyr::sdf_sql(sc, "DROP TABLE zones")
sparklyr::sdf_sql(sc, "DROP TABLE dates")
sparklyr::sdf_sql(sc, "DROP TABLE hours")
sparklyr::sdf_sql(sc, "DROP TABLE rides")
sparklyr::sdf_sql(sc, "DROP TABLE bike_zones")
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.