R/dl-bikedata.R

#' Download hire bicycle data
#'
#' Download data for subsequent storage via \link{store_bikedata}.
#'
#' @param city City for which to download bike data, or name of corresponding
#' bike system (see Details below).
#' @param data_dir Directory to which to download the files
#' @param dates Character vector of dates to download data with dates formated
#' as YYYYMM.
#' @param quiet If FALSE, progress is displayed on screen
#'
#' @section Details:
#' This function produces (generally) zip-compressed data in R's temporary
#' directory. City names are not case sensitive, and must only be long enough to
#' unambiguously designate the desired city. Names of corresponding bike systems
#' can also be given.  Currently possible cities (with minimal designations in
#' parentheses) and names of bike hire systems are:
#' \tabular{lr}{
#'  Boston (bo)\tab Hubway\cr
#'  Chicago (ch)\tab Divvy Bikes\cr
#'  Washington, D.C. (dc)\tab Capital Bike Share\cr
#'  Los Angeles (la)\tab Metro Bike Share\cr
#'  London (lo)\tab Santander Cycles\cr
#'  Minnesota (mn)\tab NiceRide\cr
#'  New York City (ny)\tab Citibike\cr
#'  Philadelphia (ph)\tab Indego\cr
#'  San Francisco Bay Area (sf)\tab Ford GoBike\cr
#' }
#'
#' Ensure you have a fast internet connection and at least 100 Mb space
#'
#' @note Only files that don't already exist in \code{data_dir} will be
#' downloaded, and this function may thus be used to update a directory of files
#' by downloading more recent files. If a particular file request fails,
#' downloading will continue regardless. To ensure all files are downloaded,
#' this function may need to be run several times until a message appears
#' declaring that 'All data files already exist'
#'
#' @export
#'
#' @examples
#' \dontrun{
#' dl_bikedata (city = 'New York City USA', dates = 201601:201613)
#' }
dl_bikedata <- function (city, data_dir = tempdir(), dates = NULL,
                         quiet = FALSE) {

    if (missing (city))
        stop ("city must be specified for dl_bikedata()")

    city <- convert_city_names (city)
    if (city == "mn")
        warning ("Data for the Nice Ride MN system must be downloaded ",
                 "manually from\nhttps://www.niceridemn.com/system-data/, and ",
                 "loaded using store_bikedata")

    dl_files <- get_bike_files (city)
    data_dir <- expand_home (data_dir) %>%
        check_data_dir () # check for existence and create if non-existent
    files <- file.path (data_dir, basename (dl_files))

    dates_exist <- TRUE # set to F if requested dates do not exist
    if (is.null (dates))
        index <- which (!file.exists (files))
    else {

        dates <- bike_convert_dates (dates) %>%
            expand_dates_to_range () %>%
            convert_dates_to_filenames (city = city) %>%
            sort ()
        dates <- unique (c (dates, tolower (dates)))
        index <- which (grepl (paste (dates, collapse = "|"), files,
                               ignore.case = TRUE))
        if (length (index) == 0)
            dates_exist <- FALSE
        else
            index <- which (!file.exists (files) &
                           grepl (paste (dates, collapse = "|"), files,
                                  ignore.case = TRUE))
    }

    if (length (index) > 0) {

        for (f in dl_files [index]) {

            # replace whitespace in URLs (see issue#53)
            furl <- gsub (" ", "%20", f)
            f <- gsub (" ", "", f)
            destfile <- file.path (data_dir, basename(f))
            if (!quiet)
                message ("Downloading ", basename (f))
            resp <- httr::GET (furl,
                               httr::write_disk (destfile, overwrite = TRUE))
            if (resp$status_code != 200) {

                count <- 0
                while (!file.exists (destfile) & count < 5) {

                    resp <- httr::GET (furl,
                                       httr::write_disk (destfile,
                                                         overwrite = TRUE))
                    count <- count + 1
                }
                if (!file.exists (destfile))
                    stop ("Download request failed")
                # some junk files are also listed on AWS but not downloadable
                if (resp$status_code != 200 & file.exists (destfile))
                    chk <- file.remove (destfile)
            }
        }
    } else {

        if (!dates_exist)
            message ("There are no ", city, " files for those dates")
        else
            message ("All data files already exist")
    }

    ptn <- ".zip"
    if (city == "lo") {

        # London has raw csv files too, but sometimes the server delivers junk
        # data with default files that are very small. The following suffices to
        # remove these:
        csvs <- file.path (data_dir, list.files (data_dir, pattern = ".csv"))
        indx <- which (file.info (csvs)$size < 1000)
        invisible (tryCatch (file.remove (csvs [indx]),
                             warning = function (w) NULL,
                             error = function (e) NULL))
        # There is also now one .xlxs file (#49, March 2017), which is converted
        # to .csv here
        xls <- file.path (data_dir, list.files (data_dir, pattern = ".xlsx"))
        for (f in xls) {

            fcsv <- paste0 (tools::file_path_sans_ext (f), ".csv")
            readxl::read_xlsx (f) %>%
                utils::write.csv (, file = fcsv, row.names = FALSE)
            chk <- file.remove (f)
        }
        ptn <- paste0 (ptn, "|.csv")
    } else if (city == "ny") {

        # ny also has some junk .zip files, but temporarily disabled because all
        # seems okay again now? (Oct 17)
        #zips <- file.path (data_dir, list.files (data_dir, pattern = '.zip'))
        #indx <- which (file.info (zips)$size < 1000)
        #invisible (tryCatch (file.remove (zips [indx]),
        #                     warning = function (w) NULL,
        #                     error = function (e) NULL))
    } else if (city == "gu" & length (index) > 0L) {

        # https://github.com/ropensci/bikedata/issues/106
        # Genders in quoted versions are unquoted when not reported. Could be
        # fixed in src code, but only by scanning each line for quotation
        # structure which makes it way slower. This instead replaces these
        # values with quoted versions.

        message ("Tidying Guadalajara files ... ", appendLF = FALSE)
        csvs <- list.files (data_dir, pattern = "^datos\\_", full.names = TRUE)
        for (i in csvs) {
            ptn <- ",\\s?NULL\\s?,"
            x <- brio::read_lines (i)
            if (any (grepl (ptn, x))) {
                x <- gsub (ptn, ",\"NULL\",", x)
                brio::write_lines (x, i)
            }
        }
        message ("done.")
    }

    ret <- list.files (data_dir, pattern = ptn, full.names = TRUE)
    invisible (ret [!grepl ("field_names", ret)])
}

#' @rdname dl_bikedata
#' @export
download_bikedata <- function (city, data_dir = tempdir(), dates = NULL,
                               quiet = FALSE) {

    dl_bikedata (city = city, data_dir = data_dir, dates = dates, quiet = quiet)
}
mpadge/bikedata documentation built on Feb. 11, 2024, 4:22 p.m.