#-------------------------------------------------------------------------------
#
# Package csvread
#
# Function csvread
#
# Sergei Izrailev, 2011-2014
#-------------------------------------------------------------------------------
# Copyright 2011-2014 Collective, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-------------------------------------------------------------------------------
#' Given a list of the column types, function \code{csvread} parses the CSV file
#' and returns a data frame.
#'
#' \code{csvread} provides functionality for loading large (10M+ lines) CSV
#' and other delimited files, similar to read.csv, but typically faster and
#' using less memory than the standard R loader. While not entirely general,
#' it covers many common use cases when the types of columns in the CSV file
#' are known in advance. In addition, the package provides a class 'int64',
#' which represents 64-bit integers exactly when reading from a file. The
#' latter is useful when working with 64-bit integer identifiers exported from
#' databases. The CSV file loader supports common column types including
#' \code{integer}, \code{double}, \code{string}, and \code{int64}, leaving
#' further type transformations to the user.
#'
#' If the number of columns, which is inferred from the number of provided \code{coltypes}, is greater than
#' the actual number of columns, the extra columns are still created. If the number of columns is
#' less than the actual number of columns in the file, the extra columns in the file are ignored.
#' Commas included in double quotes will be considered part of the field, rather than a separator, but
#' double quotes will NOT be stripped. Runaway double quotes will end at the end of the line.
#'
#' See also \code{\link{int64}} for information about dealing with 64-bit
#' integers when loading data from CSV files.
#'
#' @param file Path to the CSV file.
#' @param coltypes A vector of column types, e.g., \code{c("integer", "string")}.
#' The accepted types are "integer", "double", "string", "long", "longhex" and "integer64".
#' \itemize{
#' \item \code{integer} - the column is parsed into an R integer type (32 bit)
#' \item \code{double} - the column is parsed into an R double type
#' \item \code{string} - the column is loaded as character type
#' \item \code{long} - the column is interpreted as the decimal representation of a 64-bit
#' integer, stored as a double and assigned the \code{\link{int64}} class.
#' \item \code{longhex} - the column is interpreted as the hex representation of a 64-bit
#' integer, stored as a double and assigned the \code{\link{int64}} class
#' with an additional attribute \code{base = 16L} that is used for printing.
#' \item \code{integer64} - same as \code{long} but produces a column of class \code{integer64},
#' which should be compatible with package \code{bit64} (untested).
#' \item \code{verbose} - if \code{TRUE}, the function prints number of lines counted in the file.
#' \item \code{delimiter} - a single character delimiter, default is \code{","}.
#' }
#' @param header TRUE (default) or FALSE; indicates whether the file has a header
#' and serves as the source of column names if \code{colnames} is not provided.
#' @param colnames Optional column names for the resulting data frame. Overrides the header, if header is present.
#' If NULL, then the column names are taken from the header, or, if there is no header,
#' the column names are set to 'COL1', 'COL2', etc.
#' @param nrows If NULL, the function first counts the lines in the file. This step can be avoided if the number
#' of lines is known by providing a value to \code{nrows}. On the other hand, \code{nrows} can be
#' used to read only the first lines of the CSV file.
#' @param verbose If \code{TRUE} and \code{nrows} is \code{NULL}, the function prints
#' number of lines counted in the file.
#' @param delimiter A single character delimiter, default is \code{","}.
#'
#' @return A data frame containing the data from the CSV file.
#' @examples
#' \dontrun{
#' frm <- csvread("inst/10rows.csv",
#' coltypes = c("longhex", "string", "double", "integer", "long"),
#' header = FALSE, nrows = 10)
#' frm
#' # COL1 COL2 COL3 COL4 COL5
#' # 1 11fb89c1558c792 2011-05-06 0.150001 4970 4977
#' # 2 11fb89c1558c792 2011-05-06 0.150001 4970 4987
#' # 3 11fb89c1558c792 2011-05-06 0.150001 5200 5528
#' # 4 11fb89c1558c792 2011-05-06 0.150001 4970 5004
#' # 5 11fb89c1558c792 2011-05-06 0.150001 4970 4980
#' # 6 11fb89c1558c792 2011-05-06 0.150001 4970 5020
#' # 7 11fb89c1558c792 2011-05-06 0.150001 4970 5048
#' # 8 11fb89c1558c792 2011-05-06 0.150001 4970 5035
#' # 9 11fb89c1558c792 2011-05-06 0.150001 4970 4971
#' # 10 11fb89c1558c792 2011-05-06 0.150001 4970 4973
#'
#' typeof(frm$COL1)
#' # [1] "double"
#' class(frm$COL1)
#' # [1] "int64"
#'
#' typeof(frm$COL5)
#' # [1] "double"
#' class(frm$COL5)
#' # [1] "int64"
#' }
#' @name csvread
#' @title Fast CSV reader with a given set of column types.
#' @seealso \code{\link{int64}}
#' @keywords csv comma-separated import text
csvread <- function(file, coltypes, header = TRUE, colnames = NULL, nrows = NULL,
                    verbose = FALSE, delimiter = ",")
{
   # 'header' defaults to TRUE, matching the documented default. Without a
   # default, a call that omitted 'header' failed with a "missing argument"
   # error as soon as the argument list below was constructed.

   # When a row count is supplied it is handed to the native code as a double
   # (presumably so that counts beyond the 32-bit integer range can be
   # represented -- confirm against the C implementation). NULL is passed
   # through unchanged.
   if (!is.null(nrows)) nrows <- as.double(nrows)

   # All options travel to the native routine "readCSV" of package "csvread"
   # as a single named list.
   args <- list(filename = file, coltypes = coltypes, nrows = nrows,
                header = header, colnames = colnames, verbose = verbose,
                delimiter = delimiter)

   .Call("readCSV", args, PACKAGE = "csvread")
}
#------------------------------------------------------------------------------
#' \code{map.coltypes} guesses the column types in the CSV file by reading the first
#' \code{nrows} lines. The result can be passed to \code{csvread} as the
#' \code{coltypes} argument.
#'
#' @rdname csvread
#' @examples
#' \dontrun{
#' coltypes <- map.coltypes("inst/10rows.csv", header = FALSE)
#' coltypes
#' # V1 V2 V3 V4 V5
#' # "string" "string" "double" "integer" "integer"
#'
#' frm <- csvread(file = "inst/10rows.csv", coltypes = coltypes, header = FALSE, verbose = TRUE)
#' # Counted 10 lines.
#'
#' frm
#' # COL1 COL2 COL3 COL4 COL5
#' # 1 11fb89c1558c792 2011-05-06 0.150001 4970 4977
#' # 2 11fb89c1558c792 2011-05-06 0.150001 4970 4987
#' # 3 11fb89c1558c792 2011-05-06 0.150001 5200 5528
#' # 4 11fb89c1558c792 2011-05-06 0.150001 4970 5004
#' # 5 11fb89c1558c792 2011-05-06 0.150001 4970 4980
#' # 6 11fb89c1558c792 2011-05-06 0.150001 4970 5020
#' # 7 11fb89c1558c792 2011-05-06 0.150001 4970 5048
#' # 8 11fb89c1558c792 2011-05-06 0.150001 4970 5035
#' # 9 11fb89c1558c792 2011-05-06 0.150001 4970 4971
#' # 10 11fb89c1558c792 2011-05-06 0.150001 4970 4973
#' typeof(frm$COL1)
#' # [1] "character"
#' class(frm$COL1)
#' # [1] "character"
#'
#' typeof(frm$COL5)
#' # [1] "integer"
#' class(frm$COL5)
#' # [1] "integer"
#'
#' frm$COL1 <- as.int64(frm$COL1, base = 16)
#' frm$COL1
#' # [1] "11fb89c1558c792" "11fb89c1558c792" "11fb89c1558c792" "11fb89c1558c792"
#' # [5] "11fb89c1558c792" "11fb89c1558c792" "11fb89c1558c792" "11fb89c1558c792"
#' # [9] "11fb89c1558c792" "11fb89c1558c792"
#' typeof(frm$COL1)
#' # [1] "double"
#' class(frm$COL1)
#' # [1] "int64"
#'
#' as.character.int64(frm$COL1[1], base = 10)
#' # [1] "80986298828507026"
#' }
map.coltypes <- function(file, header = TRUE, nrows = 100, delimiter = ",")
{
   # 'header' defaults to TRUE, matching the documented default of the shared
   # 'header' parameter.

   # Let read.csv infer the column types from (at most) the first 'nrows'
   # data lines. Strings are kept as character so that typeof() reports
   # "character" rather than the integer codes of a factor.
   preview <- utils::read.csv(file, stringsAsFactors = FALSE, header = header,
                              sep = delimiter, nrows = nrows)

   # One storage type per column; vapply guarantees a character vector
   # (named after the columns) regardless of the number of columns.
   coltypes <- vapply(preview, typeof, character(1))

   # Translate R storage types into the type names understood by csvread.
   # "integer" and "double" already coincide with csvread's names.
   # read.csv reports e.g. columns that are entirely NA as logical; those
   # are read as integer.
   coltypes[coltypes == "logical"] <- "integer"
   # csvread has no complex type, so complex columns (e.g. "1+2i") are
   # loaded as strings, just like character columns.
   coltypes[coltypes %in% c("character", "complex")] <- "string"

   coltypes
}
#------------------------------------------------------------------------------
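# NOTE: the two lines below are website embed boilerplate, not R code; they
# are commented out so that this file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.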