#' @importFrom sf st_read
#' @importFrom utils read.csv
#' @importFrom methods is
#' @importFrom rgdal readOGR
#' @importFrom sf st_sfc
#' @importFrom sf st_simplify
#' @importFrom sf st_crs
#' @importFrom sf st_union
#' @importFrom sf st_write
#' @importFrom sf st_sf
#' @importFrom utils download.file
#' @importFrom utils unzip
NULL
#' @title Reads the New York taxi trip data
#'
#' @description Reads the NYC Taxi & Limousine Commission green taxi trip CSV file
#' @param url the url of the New York taxi data for analysis. Parameter is optional, if missing will use the example dataset of the library
#' @export
#' @seealso \code{\link[nytaxi]{preprocess_dataset}}
#' @return A \code{\link{data.frame}} with all the csv file for future preprocessing and processing
#' @examples \dontrun{
#' read_NYC_trip_dataset("https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2017-02.csv")
#' #Or
#' read_NYC_trip_dataset("C:\\Downloads\\green_tripdata_2017-02.csv")
#' }
#' read_NYC_trip_dataset()
read_NYC_trip_dataset <- function(url) {
  # Pick the CSV location first: the example file shipped with the package
  # when no url is supplied, otherwise whatever the caller passed in.
  csv_source <- if (missing(url)) {
    system.file("extdata", "example.csv", package = "nytaxi")
  } else {
    url
  }
  # Return the parsed table untouched; preprocessing is a separate step.
  trips <- read.csv(csv_source)
  trips
}
#' @title Preprocesses the New York taxi data CSV file to analyze
#'
#' @description Preprocessing checks the validity of the dataset, removes unneeded data columns and keeps only the necessary ones,
#' parses data into the appropriate data types, creates new columns that are necessary for analysis, and adds and fills the borough columns for the
#' pickup and drop locations
#' @param taxidata is the dataset resulted from \code{\link[nytaxi]{read_NYC_trip_dataset}}
#' @param zones_newyork is the New York zones sf object, if not present will be downloaded from NYC website
#' @export
#' @seealso \code{\link[nytaxi]{read_NYC_trip_dataset}}
#' @return A \code{\link{data.frame}} that has the resulted dataset after preprocessing
#' @examples \dontrun{
#' preprocess_dataset(read_NYC_trip_dataset())
#' }
#'
preprocess_dataset <- function(taxidata, zones_newyork) {
  # Validates the raw trip table, keeps only the analysis columns and derives
  # tip_fare_ratio, borough_PU, borough_Drop, datepart, weekday and isweekend.
  # Stops with an error when taxidata is not a data.frame or lacks a needed
  # column.
  if (!inherits(taxidata, "data.frame")) {
    stop(simpleError("Parameter expected of type data.frame"))
  }

  # Every column selected below must be validated, including payment_type;
  # otherwise a missing column surfaces as "undefined columns selected".
  needed <- c(
    "lpep_pickup_datetime", "lpep_dropoff_datetime", "PULocationID",
    "DOLocationID", "payment_type", "fare_amount", "tip_amount"
  )
  if (!all(needed %in% colnames(taxidata))) {
    stop(simpleError("Necessary columns are not included in the dataset"))
  }

  # Filter only the needed columns for analysis
  taxidata <- taxidata[, needed]

  # Zones are only downloaded when the caller did not supply them.
  zones <- if (missing(zones_newyork)) download_NY_zones() else zones_newyork

  # Convert the datetime columns into POSIXlt columns (kept as POSIXlt so
  # existing callers can still read components such as $hour from them).
  pickup <- as.POSIXlt(taxidata$lpep_pickup_datetime)
  dropoff <- as.POSIXlt(taxidata$lpep_dropoff_datetime)
  taxidata$lpep_pickup_datetime <- pickup
  taxidata$lpep_dropoff_datetime <- dropoff

  # Create a column for the tip fare ratio
  # NOTE(review): a zero fare_amount yields Inf/NaN here; Inf is not removed
  # by na.rm = TRUE in later aggregation - confirm whether that is intended.
  taxidata$tip_fare_ratio <- taxidata$tip_amount / taxidata$fare_amount

  # Borough of the pick up and drop locations. The location id is used as a
  # row position in the zones table, exactly as before.
  # NOTE(review): this assumes row i of the zones table describes location
  # id i - confirm against the zones file in use.
  zone_table <- as.data.frame(zones)
  taxidata$borough_PU <- zone_table[taxidata$PULocationID, "borough"]
  taxidata$borough_Drop <- zone_table[taxidata$DOLocationID, "borough"]

  # Part of the day of the drop-off: one label per hour 0..23, looked up by
  # hour + 1 instead of relying on repeated numeric/character coercion.
  part_of_day <- c(
    rep("After Midnight", 5L), # 0-4
    rep("Early Morning", 3L),  # 5-7
    rep("Morning", 2L),        # 8-9
    rep("Late Morning", 2L),   # 10-11
    rep("Afternoon", 3L),      # 12-14
    rep("Late Afternoon", 2L), # 15-16
    rep("Early Evening", 2L),  # 17-18
    rep("Evening", 2L),        # 19-20
    rep("Night", 3L)           # 21-23
  )
  taxidata$datepart <- part_of_day[dropoff$hour + 1L]

  # Day of the week of the drop-off. POSIXlt$wday runs 0-6 starting on
  # Sunday, so the weekend is 0 (Sunday) and 6 (Saturday).
  wday <- dropoff$wday
  taxidata$weekday <- wday
  weekend <- wday %in% c(0L, 6L)
  weekend[is.na(wday)] <- NA # unparsed dates stay NA rather than "weekday"
  taxidata$isweekend <- weekend

  taxidata
}
##' @title Download NYC zones from NYC website
#'
#' @description Downloads the zip file from NYC website (https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip)
#' unzips it to a temporary directory, read it and then delete the temporary files
#' @param as_sf if needed the zones can be returned as an sf object
#' @return A \code{\link{SpatialPolygonsDataFrame}} with the NY zones, or an sf object when \code{as_sf} is \code{TRUE}
#' @export
#' @examples \dontrun{
#' zones = download_NY_zones()
#' #or
#' zones = download_NY_zones(as_sf = T) #to download as sf object
#' }
download_NY_zones <- function(as_sf = FALSE) {
  # Downloads the taxi zones archive, extracts it into the session temp
  # directory, reads the shapefile and removes every temporary file again.
  url <- "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"

  tmpfile <- tempfile()
  # Register cleanup immediately so the archive is removed even when the
  # download, extraction or read fails.
  on.exit(unlink(tmpfile), add = TRUE)
  # The archive is binary; request binary transfer explicitly.
  download.file(url, tmpfile, mode = "wb")

  unzipped <- unzip(tmpfile, exdir = tempdir())
  on.exit(unlink(unzipped), add = TRUE)

  shp <- unzipped[grepl("\\.shp$", unzipped, ignore.case = TRUE)]
  if (length(shp) == 0L) {
    stop("No .shp file found in the archive downloaded from ", url,
         call. = FALSE)
  }

  # st_read is used when as_sf is TRUE, readOGR otherwise.
  if (isTRUE(as_sf)) {
    zones <- st_read(dsn = shp)
  } else {
    zones <- readOGR(dsn = shp)
  }
  zones
}
##' @title Creates New York borough features for doing borough level map visualization
#'
#' @description Reads zones shapefile provided by user, or download it from NYC website if not provided,
#' joins each borough's zones features together to create a geometry for each borough with its name attribute
#' @param zones_shapefile zones shapefile as an sf object, or downloads it from
#' https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip if it's left empty
#' @export
#' @seealso \code{\link[nytaxi]{read_NYC_trip_dataset}} and \code{\link[nytaxi]{preprocess_dataset}}
#' @return A \code{\link{SpatialPolygonsDataFrame}} with the borough's geometries and borough names
#' @examples \dontrun{
#' boroughs = create_borough_features()
#' }
create_borough_features <- function(zones_shapefile) {
  # Builds one geometry per borough by unioning and simplifying the zones
  # that share a borough name, then returns what readOGR reads back from a
  # temporary shapefile of those geometries.
  # Stops with an error when zones_shapefile is given but is not an sf object.
  if (missing(zones_shapefile)) {
    zones <- download_NY_zones(as_sf = TRUE)
  } else if (inherits(zones_shapefile, "sf")) {
    zones <- zones_shapefile
  } else {
    stop(simpleError("zones_shapefile should be an sf object or left empty"))
  }

  borough_names <- data.frame(borough = unique(zones$borough))
  zones_crs <- st_crs(zones)
  sfclist <- st_sfc(crs = zones_crs)
  # seq_len() keeps the loop from running when there are no boroughs.
  for (i in seq_len(nrow(borough_names))) {
    in_borough <- zones$borough == borough_names[i, ]
    geom <- st_simplify(st_union(zones[in_borough, ]), dTolerance = 150)
    sfclist[i] <- st_sfc(geometry = geom, crs = zones_crs)
  }

  # Round-trip through a shapefile in a private temporary directory. Writing
  # to (and recursively deleting) "boroughs" in the working directory could
  # destroy a directory the user already has there.
  out_dir <- tempfile("boroughs")
  dir.create(out_dir)
  on.exit(unlink(out_dir, recursive = TRUE), add = TRUE)
  shp_path <- file.path(out_dir, "boroughs.shp")

  st_write(st_sf(data.frame(borough_names, geom = sfclist)), shp_path,
           delete_layer = TRUE)
  boroughs <- readOGR(dsn = shp_path)
  boroughs
}
#' @title Returns zones data along with aggregate taxi trip data
#'
#' @description Reads the zones shapefile into a SpatialPolygonsDataFrame, aggregates the overall mean tip-fare ratio as well as the means for weekdays and weekends,
#' and returns the SpatialPolygonsDataFrame
#' @param taxidata The taxi trip data that were read from \code{\link[nytaxi]{read_NYC_trip_dataset}} and preprocessed \code{\link[nytaxi]{preprocess_dataset}}
#' @param zones_newyork is the New York zones sf object, if not present will be downloaded from NYC website
#' @export
#' @seealso \code{\link[nytaxi]{read_NYC_trip_dataset}} and \code{\link[nytaxi]{preprocess_dataset}}
#' @return A \code{\link{SpatialPolygonsDataFrame}} with the zones features with the \code{mean_tip_fair_ratio}, \code{mean_ratio_weekend} and \code{mean_ratio_weekday}
#' @examples \dontrun{
#' url = "https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2017-02.csv"
#' taxidata = preprocess_dataset(read_NYC_trip_dataset(url))
#' get_zones_with_aggregate_tripdata(taxidata)
#' }
get_zones_with_aggregate_tripdata <- function(taxidata, zones_newyork) {
  # Adds three columns to the zones object, each the mean tip_fare_ratio of
  # the trips dropped off in that zone (matched on LocationID):
  #   mean_tip_fair_ratio - all trips
  #   mean_ratio_weekend  - trips with isweekend == TRUE
  #   mean_ratio_weekday  - trips with isweekend == FALSE
  # Zones without matching trips get NaN, as mean() of an empty vector does.
  if (missing(zones_newyork)) {
    zones_newyork <- download_NY_zones()
  }

  # Pull the columns out once instead of re-subsetting the whole data frame
  # for every zone.
  ratio <- taxidata$tip_fare_ratio
  drop_id <- taxidata$DOLocationID
  on_weekend <- taxidata$isweekend == TRUE
  on_weekday <- taxidata$isweekend == FALSE
  zone_ids <- zones_newyork$LocationID

  # Mean ratio per zone for the trips selected by `keep`; which() drops NA
  # comparisons, which na.rm = TRUE discarded in the row-subsetting form.
  mean_per_zone <- function(keep) {
    vapply(
      seq_along(zone_ids),
      function(i) mean(ratio[which(drop_id == zone_ids[i] & keep)], na.rm = TRUE),
      numeric(1)
    )
  }

  zones_newyork$mean_tip_fair_ratio <- mean_per_zone(TRUE)
  zones_newyork$mean_ratio_weekend <- mean_per_zone(on_weekend)
  zones_newyork$mean_ratio_weekday <- mean_per_zone(on_weekday)
  zones_newyork
}
#' @title aggregates data from taxi trip data into the boroughs level
#'
#' @description calculates mean tip-fare ratio for each borough for each day of the week, for each time of the day
#' and for weekend/non-weekend properties
#' @param taxidata The taxi trip data that were read from \code{\link[nytaxi]{read_NYC_trip_dataset}} and preprocessed \code{\link[nytaxi]{preprocess_dataset}}
#' @param boroughs is the New York zones sf object, if not present will be downloaded from NYC website
#' @export
#' @seealso \code{\link[nytaxi]{read_NYC_trip_dataset}} and \code{\link[nytaxi]{preprocess_dataset}}
#' @return The \code{boroughs} object (a \code{\link{SpatialPolygonsDataFrame}} when created by \code{create_borough_features}) with the mean tip-fare ratio of each borough per day of the week, per time of the day, and for weekdays and weekends
#' @examples \dontrun{
#' boroughs = create_borough_features()
#' get_boroughs_with_aggregate_trip_data(taxidata, boroughs)
#' }
get_boroughs_with_aggregate_trip_data <- function(taxidata, boroughs) {
  # Adds, for every borough, the mean tip_fare_ratio of the trips dropped
  # off there (matched on borough_Drop), split by day of the week, by part
  # of the day, and by weekday/weekend. Boroughs without matching trips get
  # NaN. Stops with an error when taxidata is not supplied.
  if (missing(taxidata)) {
    stop(simpleError("Taxi data is necessary for aggergating to boroughs level"))
  }
  if (missing(boroughs)) {
    boroughs <- create_borough_features()
  }

  ratio <- taxidata$tip_fare_ratio
  # Compare as character so factor columns with differing level sets do not
  # make the comparison fail.
  drop_borough <- as.character(taxidata$borough_Drop)
  borough_ids <- as.character(boroughs$borough)

  # Mean ratio per borough for the trips selected by `keep`; which() drops
  # NA comparisons, which na.rm = TRUE discarded in the row-subsetting form.
  mean_per_borough <- function(keep) {
    vapply(
      seq_along(borough_ids),
      function(i) {
        mean(ratio[which(drop_borough == borough_ids[i] & keep)], na.rm = TRUE)
      },
      numeric(1)
    )
  }

  # The weekday column holds POSIXlt$wday, which counts 0-6 from Sunday.
  day_codes <- c(
    Sunday = 0, Monday = 1, Tuesday = 2, Wednesday = 3,
    Thursday = 4, Friday = 5, Saturday = 6
  )
  for (day in names(day_codes)) {
    boroughs[[paste0(day, "_mean_tip_fair_ratio")]] <-
      mean_per_borough(taxidata$weekday == day_codes[[day]])
  }

  # Column prefix -> label stored in the datepart column.
  part_labels <- c(
    EarlyMorning = "Early Morning", Morning = "Morning",
    LateMorning = "Late Morning", Afternoon = "Afternoon",
    LateAfternoon = "Late Afternoon", EarlyEvening = "Early Evening",
    Evening = "Evening", Night = "Night", AfterMidnight = "After Midnight"
  )
  for (part in names(part_labels)) {
    boroughs[[paste0(part, "_mean_tip_fair_ratio")]] <-
      mean_per_borough(taxidata$datepart == part_labels[[part]])
  }

  # Weekday/weekend means must filter on isweekend, not on a part of the day.
  boroughs$weekday_mean_tip_fare_ratio <-
    mean_per_borough(taxidata$isweekend == FALSE)
  boroughs$weekend_mean_tip_fare_ratio <-
    mean_per_borough(taxidata$isweekend == TRUE)

  boroughs
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.