R/import.R
In trread: Read, Validate, and Analyze Files in the General Transit Feed Specification

Documented in download_from_url import_gtfs read_gtfs unzip_file

#' Get and validate dataframes of General Transit Feed Specification (GTFS) data.
#' 
#' This function reads GTFS text files from a local or remote zip file. 
#' It also validates the files against the GTFS specification by file, requirement status, and column name
#' The data are returned as a list of dataframes and a validation object, 
#' which contains details on whether all required files were found, 
#' and which required and optional columns are present. 
#' 
#'
#' @param path Character. url link to zip file OR path to local zip file. if to local path, then option `local` must be set to TRUE.
#' @param local Boolean. If the paths are searching locally or not. Default is FALSE (that is, urls).
#' @param quiet Boolean. Whether to see file download progress and files extract. FALSE by default.
#' @param frequency Boolean. Whether to add frequency/headway calculations to the gtfs object
#'
#' @return A GTFS object. That is, a list of dataframes of GTFS data.
#'
#' @export
#' @importFrom dplyr %>% arrange summarise group_by inner_join
#' @examples \donttest{
#' library(dplyr)
#' u1 <- "https://github.com/r-transit/trread/raw/master/inst/extdata/sample-feed-fixed.zip"
#' sample_gtfs <- read_gtfs(u1)
#' attach(sample_gtfs)
#' #list routes by the number of stops they have
#' routes_df %>% inner_join(trips_df, by="route_id") %>%
#'   inner_join(stop_times_df) %>% 
#'     inner_join(stops_df, by="stop_id") %>% 
#'       group_by(route_long_name) %>%
#'         summarise(stop_count=n_distinct(stop_id)) %>%
#'           arrange(desc(stop_count))
#' }
read_gtfs <- function(path, local = FALSE, 
                      quiet = TRUE, 
                      frequency=FALSE) {
  # download zip file
  if(!local) {
    path <- download_from_url(url = path, quiet = quiet)
    if(is.null(path)) { return() }
  }
  
  # extract zip file
  tmpdirpath <- unzip_file(path, quiet=quiet)
  
  file_list_df <- zip::zip_list(path)
  if(!exists("file_list_df")) {
    stop(sprintf("No files found in zip"))
  }
  
  gtfs_obj <- create_gtfs_object(tmpdirpath, file_list_df$filename, quiet = quiet)
  
  if(frequency) {
    gtfs_obj <- get_route_frequency(gtfs_obj) 
  }
  
  return(gtfs_obj) 
}

#' This function is deprecated. Please use read_gtfs
#' 
#' This function reads GTFS text files from a local or remote zip file. 
#' It also validates the files against the GTFS specification by file, requirement status, and column name
#' The data are returned as a list of dataframes and a validation object, 
#' which contains details on whether all required files were found, 
#' and which required and optional columns are present. 
#' 
#'
#' @param path Character. url link to zip file OR path to local zip file. if to local path, then option `local` must be set to TRUE.
#' @param local Boolean. If the paths are searching locally or not. Default is FALSE (that is, urls).
#' @param quiet Boolean. Whether to see file download progress and files extract. FALSE by default.
#'
#' @return Dataframes of GTFS data.
#'
#' @export
#' @importFrom dplyr %>% arrange summarise group_by inner_join
#' @examples \donttest{
#' library(dplyr)
#' u1 <- "https://github.com/r-transit/trread/raw/master/inst/extdata/sample-feed-fixed.zip"
#' sample_gtfs <- import_gtfs(u1)
#' attach(sample_gtfs)
#' #list routes by the number of stops they have
#' routes_df %>% inner_join(trips_df, by="route_id") %>%
#'   inner_join(stop_times_df) %>%
#'     inner_join(stops_df, by="stop_id") %>%
#'       group_by(route_long_name) %>%
#'         summarise(stop_count=n_distinct(stop_id)) %>%
#'           arrange(desc(stop_count))
#' }
import_gtfs <- function(path, local = FALSE, quiet = FALSE) {
  .Deprecated("read_gtfs") #include a package argument, too
  read_gtfs(path, local = FALSE, quiet = FALSE)
}

#' Download a zipped GTFS feed file from a url
#'
#' @param url Character URL of GTFS feed.
#' @param path Character. Folder into which to put zipped file. If NULL, then save a tempfile
#' @param quiet Boolean. Whether to see file download progress. FALSE by default.
#'
#' @return File path
#' @importFrom dplyr %>%
#'
#' @keywords internal

download_from_url <- function(url, path=tempfile(fileext = ".zip"), quiet=FALSE) {
  
  stopifnot(length(url) == 1)
  
  # check if single element of dataframe was inputed. if so, convert to single value; error otherwise.
  if(!is.null(dim(url))) {
    if(all(dim(url) == c(1,1))) {
      url <- unlist(url, use.names = FALSE)
    } else {
      stop('Please input a single url.')
    }
  }
  
  # check if url links to a zip file
  if(!valid_url(url)) {
    stop1 <- sprintf("Link '%s' is invalid; failed to connect.", url)
    stop(stop1)
  }
  
  r <- httr::GET(url)
  
  # Get gtfs zip if url can be reach
  if(httr::status_code(r) == 200) {
    check <- try(utils::download.file(url, path, quiet = quiet), silent=TRUE)
    if(check %>% assertthat::is.error()) {
      warn <- sprintf("Link '%s' failed to download. NULL was returned.", url)
      warning(warn)
      return(NULL)
    }
  } else {
    warn <- sprintf("Link '%s' cannot be reached. NULL was returned.", url)
    warning(warn)
    return(NULL)
  }
  
  # check path
  check <- try(normalizePath(path), silent = TRUE)
  if(assertthat::is.error(check)) {
    warn <- 'Invalid file path. NULL is returned.'
    warning(warn)
    return(NULL)
  }
  return(path)
}

#' Checks UTF-8-BOM encoding.
#' 
#' Special thanks to @patperu for finding the issue and 
#' to @hrbrmstr for the code to help deal with the issue.
#' 
#' @param path the path the the text file
#' @param encoding can be one of \code{UTF-8}, \code{UTF-16} or \code{UTF-16BE}.
#'        Although a BOM could be used with UTF-32 and other encodings, such
#'        encodings are rarely used for data transmission and the three supported
#'        encodings are the most likely ones folks in R will be working with from
#'        web APIs.\cr\cr
#'        This function defaults to looking for \code{UTF-8} BOM, but you can
#'        override it.
#' @return \code{TRUE} if response contains a BOM, \code{NA} if an unsupported encoding
#'         was passed (along with a message)
#' @references \href{http://www.unicode.org/faq/utf_bom.html}{UTF-8, UTF-16, UTF-32 & BOM}
#' @noRd
#' @keywords internal
#' @author @@hrbrmstr

has_bom <- function(path, encoding="UTF-8") {

  B <- readBin(path, "raw", 4, 1)
  switch(encoding,
       `UTF-8`=B[1]==as.raw(0xef) & B[2]==as.raw(0xbb) & B[3]==as.raw(0xbf),
       `UTF-16`=B[1]==as.raw(0xff) & B[2]==as.raw(0xfe),
       `UTF-16BE`=B[1]==as.raw(0xfe) & B[2]==as.raw(0xff),
       { message("Unsupported encoding") ; return(NA) }
  )
}

#' Unzip a file and delete zip
#'
#' @param zipfile path to zipped file
#' @param tmpdirpath path to unzip file to-default tempdir()
#' @param quiet Boolean. Whether to output files found in folder.
#' @importFrom tools file_ext
#' 
#' @return file path to directory with gtfs .txt files
#' @keywords internal
#' 

unzip_file <- function(zipfile, 
                       tmpdirpath=tempdir(), 
                       quiet = FALSE) {
  f <- zipfile
  
  # check path
  if(try(path.expand(f), silent = TRUE) %>% assertthat::is.error()) {
    warn <- 'Invalid file path. NULL is returned.'
    warning(warn)
    return(NULL)
  }

  f <- normalizePath(f)

  if(tools::file_ext(f) != "zip") {
    if(!quiet) message('No zip file found, reading files from path.')
    return(f)
  }
  
  # create extraction folder
  utils::unzip(f, exdir=tmpdirpath)


  if(length(list.files(tmpdirpath)) == 0) {
    warn <- "No files found after decompressing. NULL is returned."
    return(NULL)
  }

  if(!quiet) {
    message(sprintf("Unzipped the following files to directory '%s'...", tmpdirpath))
    list.files(tmpdirpath) %>% print
  }

  return(tmpdirpath)
}

#' Function to read all files into dataframes
#'
#' @param file_path Character file path
#' @param assign_envir Environment Object. Option of where to assign dataframes.
#' @param quiet Boolean. Whether to output messages and files found in folder.
#' @noRd
#' @keywords internal

create_gtfs_object <- function(tmpdirpath, file_paths, quiet = FALSE) {
  prefixes <- vapply(file_paths,get_file_shortname,FUN.VALUE = "")
  df_names <- paste(prefixes,"_df",sep="")
  if(!quiet) message('Reading files in feed...\n')
  gtfs_obj <- lapply(file_paths, 
                   function(x) read_gtfs_file(x, 
                                              tmpdirpath, 
                                              quiet = quiet))
  names(gtfs_obj) <- unname(df_names)
  gtfs_obj[sapply(gtfs_obj, is.null)] <- NULL
  class(gtfs_obj) <- "gtfs"
  if(!quiet) message('Reading files in feed... done.\n')
  
    
  gtfs_obj <- validate_gtfs(gtfs_obj, quiet = quiet)
  
  stopifnot(is_gtfs_obj(gtfs_obj))
  
  if(!quiet) message("Reading gtfs feed completed.\n\n")
  
  return(gtfs_obj)
}


#' Function to read all files into dataframes
#'
#' @param file_path Character file path
#' @param tmpdirpath path for the tmpdir files
#' @param quiet Boolean. Whether to output messages and files found in folder.
#' @noRd
#' @keywords internal

read_gtfs_file <- function(file_path, tmpdirpath, quiet = FALSE) {
  prefix <- get_file_shortname(file_path)

  if(!quiet) message(paste0('Reading ', prefix))

  full_file_path <- paste0(tmpdirpath,"/",file_path)
  new_df <- parse_gtfs_file(prefix, full_file_path, quiet = quiet)

  return(new_df)
}

#' Function to get the gtfs table name from the file string
#'
#' @param file_path Character file path
#' @return df_name a character vector of the df_name for the file
#' @noRd
#' @keywords internal
#' 
get_file_shortname <- function(file_path) {
  split_path <- strsplit(file_path, '/')
  file_name <- split_path[[1]][length(split_path[[1]])]

  prefix <- gsub('.txt|-new', '', file_name) 
  # suffix '.*-new.txt' comes from trillium data
  prefix <- gsub('\\-|\\.', '_', prefix)
  return(prefix)
}

#' Parses one gtfs file
#'
#' @param prefix Character. gtfs file prefix (e.g. 'agency', 'stop_times', etc.)
#' @param file_path Character. file path
#' @param quiet Boolean. Whether to output messages and files found in folder.
#' @return Dataframe of parsed GTFS file.
#' @noRd
#' @keywords internal
#' @importFrom data.table fread

parse_gtfs_file <- function(prefix, file_path, quiet = FALSE) {

  # only parse if file has any data, NULL o/w
  stopifnot(!is.na(file.size(file_path)))
  if(file.size(file_path) > 1) {

    ## get correct meta data using file prefix (e.g. 'agency', 'stop_times')
    meta <- get_gtfs_meta()[[prefix]]

    # check if a file is empty. If so, return NULL.
    L <- suppressWarnings(length(scan(file_path, what = "", quiet = TRUE, sep = '\n')))
    if(L < 1) {
      s <- sprintf("   File '%s' is empty.", basename(file_path))
      if(!quiet) message(s)
      return(NULL)
    }

    # if no meta data is found for a file type but file is not empty, read as is.
    if(is.null(meta)) {
      s <- sprintf("   File %s not recognized, trying to read file as csv.", basename(file_path))
      if(!quiet) message(s)

      tryCatch({
        df <- suppressMessages(data.table::fread(file = file_path, sep=","))
      }, error = function(error_condition) {
        s <- sprintf("   File could not be read as csv.", basename(file_path))
        if(!quiet) message(s)
        return(NULL)
      })
      return(df)
    }

    ## read.csv supports UTF-8-BOM. use this to get field names.
    small_df <- suppressWarnings(utils::read.csv(file_path, nrows = 5, stringsAsFactors = FALSE)) # get a small df to find how many cols are needed

    ## get correct coltype, if possible
    coltypes_character <- rep('c', dim(small_df)[2]) # create 'c' as coltype defaults
    names(coltypes_character) <- names(small_df) %>% tolower()
    indx <- match(names(coltypes_character), meta$field)  # indx from valid cols in meta$field. NAs will return for invalid cols

    ## !is.na(indx) = valid col in 'coltype' found in meta$field
    ## indx[!is.na(indx)] = location in 'meta$coltype' where corresponding type is found
    coltypes_character[!is.na(indx)] <- meta$coltype[indx[!is.na(indx)]] # valid cols found in small_df

    # use col_*() notation for column types
    coltypes <-
      sapply(
        coltypes_character,
        switch,
        "c" = readr::col_character(),
        "i" = readr::col_integer(),
        "d" = readr::col_double(),
        "D" = readr::col_date(format = "%Y%m%d")
      )
    
    if (has_bom(file_path)) { # check for BOM. if yes, use read.csv()
      ## switch function
      converttype <- function(x, y) {
        switch(x, character = as.character(y), integer = as.integer(y), double = as.double(y), Date = lubridate::ymd(y))
      }
      colnms <- meta$field[indx] # get expected/required names for columns. these are imposed.
      
      ## get colclasses
      colclasses <- sapply(coltypes_character, switch, c = "character", i = "integer", d = "double", "D" = "Date")
      
      csv <- quote(utils::read.csv(file_path, col.names = colnms, stringsAsFactors= FALSE))
      df <- try(suppressWarnings(eval(csv)) %>%
          mapply(converttype, x = colclasses, y = ., SIMPLIFY = FALSE) %>% # ensure proper column types
          tibble::as_tibble())

      if(any(class(df) %in% "try-error")) {
        probs <- "Error during import. Likely encoding error. Note that utils::read.csv() was used, not readr::read_csv()."
        attributes(df) <- append(attributes(df), list(problems = probs))
      }
    } else {
      df <- suppressWarnings(
        readr::read_csv(file = file_path, 
          col_types = coltypes
        )
      )
      probs <- readr::problems(df)
      
      if(dim(probs)[1] > 0) {
        attributes(df) <- append(attributes(df), list(problems = probs))
        warning(paste0("Parsing failures while reading ", prefix))
        print(probs)
      }
    }

    return(df)
  } else return(NULL)

}

Any scripts or data that you put into this service are public.

trread documentation built on May 1, 2019, 10:14 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

trread
Read, Validate, and Analyze Files in the General Transit Feed Specification

R/import.R
In trread: Read, Validate, and Analyze Files in the General Transit Feed Specification

Defines functions read_gtfs import_gtfs download_from_url has_bom unzip_file create_gtfs_object read_gtfs_file get_file_shortname parse_gtfs_file

Documented in download_from_url import_gtfs read_gtfs unzip_file

Try the trread package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trread Read, Validate, and Analyze Files in the General Transit Feed Specification

R/import.R In trread: Read, Validate, and Analyze Files in the General Transit Feed Specification

Defines functions read_gtfs import_gtfs download_from_url has_bom unzip_file create_gtfs_object read_gtfs_file get_file_shortname parse_gtfs_file

Documented in download_from_url import_gtfs read_gtfs unzip_file

Try the trread package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trread
Read, Validate, and Analyze Files in the General Transit Feed Specification

R/import.R
In trread: Read, Validate, and Analyze Files in the General Transit Feed Specification