R/create_estimation_data.R

Defines functions create_estimation_data

Documented in create_estimation_data

#' Create estimation data
#'
#' Create the dataset used for the estimation.
#'
#' @param sc Spark connection
#'
#' @export

create_estimation_data <- function(sc){

	sparklyr::sdf_sql(sc, "SELECT rides.dba_category, rides.pickup_zone, rides.pickup_date, rides.pickup_hour, rides.n, bike_zones.n_bikestation, CASE WHEN bike_zones.n_bikestation > 0 THEN 1 ELSE 0 END AS bikezone
						   FROM rides
						   LEFT JOIN bike_zones
					       ON rides.pickup_zone = bike_zones.zone_id") %>%
		dplyr::compute("rides_taken")

	sparklyr::sdf_sql(sc, "SELECT DISTINCT dba_category FROM rides") %>% dplyr::compute("firms")
	sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_zone FROM rides") %>% dplyr::compute("zones")
	sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_date FROM rides") %>% dplyr::compute("dates")
	sparklyr::sdf_sql(sc, "SELECT DISTINCT pickup_hour FROM rides") %>% dplyr::compute("hours")

	sparklyr::sdf_sql(sc, "SELECT * FROM firms
						   CROSS JOIN zones
						   CROSS JOIN dates
					       CROSS JOIN hours") %>% dplyr::compute("grid")

	sparklyr::sdf_sql(sc, "SELECT (CASE WHEN g.dba_category = 'lyft' THEN 1 ELSE 0 END) AS lyft, g.pickup_date,  g.pickup_hour, g.pickup_zone, r.bikezone, r.n_bikestation, r.n
						   FROM grid AS g
						   LEFT JOIN rides_taken AS r
						   ON g.dba_category = r.dba_category AND g.pickup_zone = r.pickup_zone AND g.pickup_date = r.pickup_date AND g.pickup_hour = r.pickup_hour") %>%
		dplyr::compute("paperdata")

	sparklyr::sdf_sql(sc, "DROP TABLE grid")
	sparklyr::sdf_sql(sc, "DROP TABLE rides_taken")
	sparklyr::sdf_sql(sc, "DROP TABLE firms")
	sparklyr::sdf_sql(sc, "DROP TABLE zones")
	sparklyr::sdf_sql(sc, "DROP TABLE dates")
	sparklyr::sdf_sql(sc, "DROP TABLE hours")
	sparklyr::sdf_sql(sc, "DROP TABLE rides")
	sparklyr::sdf_sql(sc, "DROP TABLE bike_zones")

	return(NULL)




}
hktosun/gozentosun2021 documentation built on Dec. 20, 2021, 4:44 p.m.