YaleToolkit: Data exploration tools from Yale University.

Documented in big.read.table

#' @title Read in chunks from a large file with row/column filtering
#' to obtain a reasonable-sized data.frame. 
#' @param file the name of the file, obviously
#' @param nrows the chunk size; consider reducing this if there are
#' lots of columns
#' @param sep by default we expect a CSV file
#' @param header is \code{TRUE} by default
#' @param row.names I really dislike row names
#' @param cols for filtering column by name or number (supporting negative indexing)
#' @param rowfilter a function that is assumed to take a chunk as a
#' data frame and return a smaller data frame (with fewer rows), separately
#' from the column filtering.
#' @param as.is \code{TRUE} by default
#' @param estimate do a preliminary estimation of the work to be done,
#' and then have a chance to bail out if it looks like a bad idea
#' @examples
#' data(CO2)
#' write.csv(CO2, "CO2.csv", row.names=FALSE)
#' x <- big.read.table("CO2.csv", nrows=10)
#' head(x)
#' @export
big.read.table <- function(file, nrows=100000, sep=",",
                           header=TRUE, row.names=NULL,
                           cols=NULL, rowfilter=NULL,
                           as.is=TRUE, estimate=FALSE)
{  
  if (estimate) {
    nlines <- getnrows(file)
    x <- read.table(file, sep=sep, row.names=row.names,
                    nrows=min(nlines, 1000), header=header)
    if (!is.null(cols)) x <- x[,cols,drop=FALSE]
    cat("Estimated read size without row filtering:",
        floor(object.size(x)*nlines/nrow(x)/1e6), "MB\n")
    if (interactive()) {
      ANSWER <- readline("Continue with read (Y/n)? ")
      if (substring(ANSWER, 1, 1) != "Y") {
        warning("Terminated read.")
        return(NULL)
      }
    }
  }
  
  iter <- iread.table(file, header=header,
                      row.names=row.names, sep=sep,
                      nrows=nrows, as.is=as.is)
  
  ans <- foreach(x=iter, .combine=rbind) %do% {
    if (!is.null(rowfilter)) x <- rowfilter(x)
    if (!is.null(cols)) x <- x[,cols,drop=FALSE]
    gc()
    return(x)
  }
  
  return(ans)
}