# R/sfread.R

# Defines functions fixListColClasses fixCharacterColClasses readHeader sfread

# Borrowed from pdapbase pkg

#' Safe, fast, flat-file reader
#'
#' Wrapper to \code{data.table::fread} that reads a flat file, providing
#' additional fixing of the \code{colClasses} argument, and a default behavior
#' to treat large numbers as numeric rather than integer64.
#'
#' @param input A string giving the name of a file (or other input types
#' supported by \code{\link[data.table]{fread}}).
#' @param ... Passed to \code{\link[data.table]{fread}}. These arguments are
#' used both for the header-only read that discovers the column names (with
#' the exception of \code{nrows}) and for the main read.
#' @param colClasses A list or character vector specifying the output format
#' for columns, or \code{NULL} (the default). Values of any other type are
#' ignored, i.e. treated as \code{NULL}.
#' @param integer64 Controls whether large integer values are read in as
#' 'integer64' or as 'numeric'. Passed to \code{fread}.
#' @return A data.table.
#' @note This function differs from \code{data.table::fread} in two ways.
#'
#' First: its default behaviour is to read in large integers as 'numeric'
#' rather than 'integer64' (in contrast to \code{data.table::fread}). At this
#' point in time, much of R doesn't seem to be yet ready to handle the
#' integer64 class correctly. The call data.matrix(aDataFrameWithInt64)
#' doesn't convert correctly. The Architect data viewer doesn't display
#' integer64 correctly.
#'
#' Second: columns specified in the \code{colClasses} argument that don't
#' exist in the file being read are ignored rather than throwing an error.
#' This caters adequately for files with sparse columns of data where the type
#' is hard to auto-guess, and where those columns do not always exist in the
#' dataset. Many of the text files generated by MaxQuant have this property;
#' elsewhere it should be incredibly rare.
#' @seealso \code{\link[data.table]{fread}}
#' @examples
#' if (interactive()) {
#'   # In both these examples, colClasses specifies behaviour for non-existent
#'   # column "baz".
#'   # List syntax
#'   sfread("foo,bar\n1,a", colClasses = list(numeric = c("foo", "baz")))
#'   # Character syntax
#'   sfread("foo,bar\n1,a", colClasses = c(foo = "numeric", baz = "numeric"))
#'
#'   # Compare to fread
#'   assertive.base::dont_stop(
#'     data.table::fread(
#'       "foo,bar\n1,a",
#'       colClasses = list(numeric = c("foo", "baz"))
#'     )
#'   )
#'   assertive.base::dont_stop(
#'     data.table::fread(
#'       "foo,bar\n1,a",
#'       colClasses = c(foo = "numeric", baz = "numeric")
#'     )
#'   )
#' }
#' @importFrom data.table fread
#' @noRd
sfread <- function(input, ..., colClasses = NULL, integer64 = "numeric")
{
    # inherits() copes with objects whose class vector has more than one
    # element (e.g. a subclassed list), where switch(class(x), ...) would fail.
    isCharacterSpec <- inherits(colClasses, "character")
    isListSpec <- inherits(colClasses, "list")

    if (isCharacterSpec || isListSpec) {
        # The header is only needed to filter colClasses, so it is only read
        # when there is something to filter.
        headerArgs <- c(list(input = input), list(...))
        # A caller-supplied row limit is meant for the data read; drop it here
        # so that readHeader() falls back on its own default.
        headerArgs[["nrows"]] <- NULL
        theHeader <- do.call(readHeader, headerArgs)

        colClasses <- if (isCharacterSpec) {
            fixCharacterColClasses(colClasses, theHeader)
        } else {
            fixListColClasses(colClasses, theHeader)
        }
    } else {
        # Anything that is neither a character vector nor a list is ignored.
        colClasses <- NULL
    }

    fread(input, ..., colClasses = colClasses, integer64 = integer64)
}

#' Read the column names of a flat file
#'
#' Calls \code{data.table::fread} with a row limit and returns only the column
#' names of the result. Warnings raised by that call are suppressed.
#' @param input A string giving the name of a file (or other input types
#' supported by \code{\link[data.table]{fread}}).
#' @param nrows Number of rows to read, passed to
#' \code{\link[data.table]{fread}}. Defaults to 0.
#' @param header Passed to \code{\link[data.table]{fread}}. Defaults to
#' \code{TRUE}.
#' @param ... Passed to  \code{\link[data.table]{fread}}.
#' @return A character vector of column names.
#' @importFrom data.table fread
#' @noRd
readHeader <- function(input, nrows = 0, header = TRUE,  ...)
{
    headerOnly <- suppressWarnings(
        fread(input, nrows = nrows, header = header, ...)
    )
    names(headerOnly)
}

#' Drop character colClasses entries for columns missing from the file
#'
#' In a named character \code{colClasses}, the names are the column names and
#' the values are the classes, so the filtering has to be done on the names.
#' @param colClasses A character vector of classes, optionally named by column.
#' @param header A character vector of the column names present in the file.
#' @return \code{colClasses} restricted to the entries whose name appears in
#' \code{header}. An unnamed vector is returned unchanged.
#' @noRd
fixCharacterColClasses <- function(colClasses, header)
{
    columnNames <- names(colClasses)
    if (is.null(columnNames)) {
        # Without names there is no column reference to check against the
        # header, so the specification is left for fread to interpret.
        return(colClasses)
    }
    colClasses[columnNames %in% header]
}

#' Drop list colClasses entries for columns missing from the file
#'
#' @param colClasses A list whose elements are vectors of column names or
#' column positions.
#' @param header A character vector of the column names present in the file.
#' @return A list with the same names as \code{colClasses}. Character elements
#' keep only the names found in \code{header}; numeric elements keep only the
#' positions between 1 and \code{length(header)}. Duplicates are removed.
#' @noRd
fixListColClasses <- function(colClasses, header)
{
    validPositions <- seq_along(header)
    lapply(
        colClasses,
        function(columns) {
            if (is.numeric(columns)) {
                # Positions cannot be matched against the character header;
                # intersecting them with it would discard every one of them.
                unique(columns[columns %in% validPositions])
            } else {
                intersect(columns, header)
            }
        }
    )
}
# graumannlab/readat documentation built on May 16, 2020, 10:15 p.m.