R/TabularTextFile.R

###########################################################################/**
# @RdocClass TabularTextFile
#
# @title "The TabularTextFile class"
#
# \description{
#  @classhierarchy
#
#  A TabularTextFile is an object referring to a tabular text file
#  on a file system containing data in a tabular format.
#  Methods for reading all or a subset of the tabular data exist.
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Arguments passed to @see "GenericTabularFile".}
#   \item{sep}{A @character specifying the symbol used to separate the
#     cell entries.  If more than one symbol is specified, it will try to
#     select the correct one by peeking into the file.}
#   \item{quote}{A @character specifying the quote symbol used, if any.}
#   \item{fill}{As in @see "utils::read.table".}
#   \item{skip}{As in @see "utils::read.table".}
#   \item{columnNames}{A @logical or a @character @vector. If @TRUE,
#      then column names are inferred from the file.  If a @character
#      @vector, then the column names are given by this argument.}
#   \item{commentChar}{A single @character specifying which symbol
#      should be used for comments, cf. @see "utils::read.table".}
#   \item{.verify, verbose}{(Internal only) If @TRUE, the file is
#      verified while the object is instantiated by the constructor.
#      The verbose argument is passed to the verifier function.}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# @examples "../incl/TabularTextFile.readDataFrame.Rex"
#
# @author
#
# \seealso{
#   An object of this class is typically part of an @see "TabularTextFileSet".
# }
#*/###########################################################################
setConstructorS3("TabularTextFile", function(..., sep=c("\t", ","), quote="\"", fill=FALSE, skip=0L, columnNames=NA, commentChar="#", .verify=TRUE, verbose=FALSE) {
  # Argument 'columnNames':
  if (is.logical(columnNames)) {
    readColumnNames <- columnNames
    columnNames <- NULL
  } else if (is.character(columnNames)) {
    readColumnNames <- FALSE
  } else {
    throw("Argument 'columnNames' must be either a logical or a character vector: ", class(columnNames)[1])
  }

  # Argument 'commentChar':
  if (identical(commentChar, "") || identical(commentChar, FALSE)) {
    commentChar <- NULL
  } else if (!is.null(commentChar)) {
    commentChar <- Arguments$getCharacter(commentChar, nchar=c(1,1))
  }


  this <- extend(GenericTabularFile(..., .verify=FALSE), "TabularTextFile",
    "cached:.fileHeader" = NULL,
    "cached:.nbrOfLines" = NULL,
    .columnNameTranslator = NULL,
    sep = sep,
    quote = quote,
    fill = fill,
    skip = skip,
    .commentChar = commentChar,
    .columnNames = columnNames,
    readColumnNames = readColumnNames
  )

  if (.verify) {
    verify(this, ..., verbose=verbose)
    # Clear temporarily settings
    this$.fileHeader <- NULL
  }

  this
})


setMethodS3("as.character", "TabularTextFile", function(x, ...) {
  this <- x
  s <- NextMethod()
  colnames <- getColumnNames(this)
  if (length(colnames) > 0L) {
    columns <- paste("'", colnames, "'", sep="")
    s <- c(s, sprintf("Columns [%d]: %s", length(columns), paste(columns, collapse=", ")))
  } else {
    s <- c(s, sprintf("Columns [NA]: <not reading column names>"))
  }
  s <- c(s, sprintf("Number of text lines: %d", nbrOfLines(this, fast=TRUE)))
  s
}, protected=TRUE)


setMethodS3("verify", "TabularTextFile", function(this, ..., verbose=FALSE) {
  # Nothing to do?
  pathname <- getPathname(this)
  if (is.null(pathname) || is.na(pathname))
    return(invisible(this))


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Validating file contents")

  tryCatch({
    data <- readDataFrame(this, nrow=10L, verbose=verbose)
  }, error = function(ex) {
    throw("File format error of the tabular file ('", getPathname(this), "'): ", ex$message)
  })

  verbose && exit(verbose)

  invisible(this)
}, private=TRUE)


setMethodS3("getCommentChar", "TabularTextFile", function(this, ...) {
  this$.commentChar
}, protected=TRUE)


setMethodS3("setCommentChar", "TabularTextFile", function(this, ch, ...) {
  if (!is.null(ch)) {
    ch <- Arguments$getCharacter(ch, nchar=c(1,1))
  }
  this$.commentChar <- ch
  invisible(this)
}, protected=TRUE)


setMethodS3("readColumnNames", "TabularTextFile", function(this, ...) {
  res <- as.logical(this$readColumnNames)

  # No need to infer from header?
  if (!is.na(res)) {
    return(res)
  }

  hdr <- readRawHeader(this, ...)
  namesHdr <- hdr$commentArgs$columnNames
  if (is.null(namesHdr)) {
    msg <- sprintf("Cannot infer whether the data table has column names or not, because header comment argument 'columnNames' is missing. Will assume there are column names (readColumnNames=TRUE): %s", getPathname(this))
#    warning(msg)
    return(TRUE)
  }

  # If argument 'columnNames' equals the first data row, then yes.
  namesData <- hdr$topRows[[1]]
  res <- all(namesData == namesHdr)

  # Store results(?!?)
#  this$readColumnNames <- res

  res
}, protected=TRUE)


###########################################################################/**
# @RdocMethod hasColumnHeader
#
# @title "Checks if there are column names in the header"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Not used.}
# }
#
# \value{
#   Returns a @logical.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("hasColumnHeader", "TabularTextFile", function(this, ...) {
  readColumnNames(this)
}, protected=TRUE)



###########################################################################/**
# @RdocMethod getDefaultColumnNames
#
# @title "Gets the default column names"
#
# \description{
#  @get "title" by inferring it from the file header.
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Optional arguments passed @seemethod "getHeader".}
# }
#
# \value{
#   Returns @character @vector,
#   or @NULL if there are no column names in the file header.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("getDefaultColumnNames", "TabularTextFile", function(this, ...) {
  # (a) Hardcoded/user-specified column names?
  names <- this$.columnNames
  if (!is.null(names)) {
    return(names)
  }

  hdr <- getHeader(this, ...)

  # (b) Infer column names from data table
  if (hasColumnHeader(this)) {
    names <- hdr$columns
    return(names)
  }

  # (c) Infer column names from header argument 'columnNames'?
  useHeaderArgs <- this$.useHeaderArgs
  if (is.null(useHeaderArgs)) useHeaderArgs <- TRUE
  if (useHeaderArgs) {
    args <- hdr$commentArgs
    if (length(args) > 0L) {
      names <- args$columnNames
      return(names)
    }
  }

  # (c) There are no column names
  NULL
}, protected=TRUE)


setMethodS3("getDefaultColumnClasses", "TabularTextFile", function(this, ...) {
  ncol <- nbrOfColumns(this)

  hdr <- getHeader(this, ...)

  # (a) Infer column names from header argument 'columnNames'?
  useHeaderArgs <- this$.useHeaderArgs
  if (is.null(useHeaderArgs)) useHeaderArgs <- TRUE
  if (useHeaderArgs) {
    args <- hdr$commentArgs
    if (length(args) > 0L) {
      colClasses <- args$columnClasses
      if (!is.null(colClasses)) {
        # Sanity check
        .stop_if_not(length(colClasses) == ncol)
        return(colClasses)
      }
    }
  }

  # (b) Column classes are not specified in the file
  NULL
}, protected=TRUE)


setMethodS3("getDefaultColumnClassPatterns", "TabularTextFile", function(this, ...) {
  names <- getColumnNames(this, ...)
  colClasses <- getDefaultColumnClasses(this, ...)
  if (is.null(colClasses)) return(NULL)

  names <- sprintf("^%s$", names)
  names(colClasses) <- names
  colClasses
}, protected=TRUE)


###########################################################################/**
# @RdocMethod getHeader
#
# @title "Gets the file header"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Passed to internal @seemethod "readRawHeader".}
#   \item{header}{A @logical specifying whether there are column
#    headers or not.}
#   \item{force}{If @TRUE, an already retrieved header will be ignored.}
# }
#
# \value{
#   Returns a named @list.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("getHeader", "TabularTextFile", function(this, ..., header=TRUE, force=FALSE) {
  hdr <- this$.fileHeader
  if (force || is.null(hdr) || hasBeenModified(this, update=FALSE)) {
    hdr <- readRawHeader(this, ...)
    if (header && hasColumnHeader(this)) {
      hdr$columns <- hdr$topRows[[1]]
    }
    this$.fileHeader <- hdr
  }
  hdr
})




setMethodS3("readRawHeader", "TabularTextFile", function(this, con=NULL, skip=this$skip, sep=this$sep, ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'skip':
  if (is.null(skip)) {
    skip <- 0L
  } else if (is.character(skip)) {
    skip <- Arguments$getRegularExpression(skip)
  } else {
    skip <- Arguments$getInteger(skip, range=c(0,Inf))
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Reading tabular file header from ", class(this)[1])

  # Open a file connection?
  if (is.null(con)) {
    pathname <- getPathname(this)
    verbose && cat(verbose, "Pathname: ", pathname)

    # Open file connection
    con <- file(pathname, open="r")
    on.exit({
      if (!is.null(con)) {
        close(con)
        con <- NULL
      }
    })
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # How to skip
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  skipMax <- 0L

  # Skip a fixed number of rows?
  if (is.numeric(skip)) {
    skippedLines <- readLines(con, n=skip)
    skipMax <- skipMax + skip
  }

  # Skip until regular expression?
  skipUntil <- NULL
  if (is.character(skip)) {
    skipUntil <- skip
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Header comments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read header comments
  comments <- c()
  ch <- getCommentChar(this)
  ch <- if (is.null(ch)) "#" else ch
  pattern <- sprintf("^%s", ch)
  ready <- FALSE
  repeat {
    line <- readLines(con, n=1L)
    if (length(line) == 0) break
    isEmpty <- (regexpr("^$", line) != -1L)
    if (!isEmpty) {
      if (!is.null(skipUntil)) {
        if (regexpr(skipUntil, line) != -1L) {
          break
        }
      }
      isComments <- (regexpr(pattern, line) != -1L)
      if (isComments) {
        comments <- c(comments, line)
        skipMax <- skipMax + 1L
      } else if (is.null(skipUntil)) {
        break
      }
    }
  } # repeat

  verbose && cat(verbose, "Header comments:", level=-20)
  verbose && str(verbose, comments, level=-20)


  # Parse header comments
  pattern <- sprintf("^%s[ ]*([^:]+):[ ]*(.*)", ch)
  commentArgs <- grep(pattern, comments, value=TRUE)
  if (length(commentArgs) > 0L) {
    keys <- gsub(pattern, "\\1", commentArgs)
    keys <- trim(keys)
    commentArgs <- gsub(pattern, "\\2", commentArgs)
    commentArgs <- trim(commentArgs)
    commentArgs <- strsplit(commentArgs, split="\t", fixed=TRUE)
    names(commentArgs) <- keys
  } else {
    commentArgs <- NULL
  }
  verbose && cat(verbose, "Header comment arguments:", level=-20)
  verbose && str(verbose, commentArgs, level=-20)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Column separator
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Infer column separator from the first line after the header comments?
  if (length(sep) > 1L) {
    verbose && enter(verbose, "Identifying the separator that returns most columns")
    verbose && cat(verbose, "Line:")
    verbose && print(verbose, line)
    verbose && cat(verbose, "Separators:")
    verbose && str(verbose, sep)
    columns <- base::lapply(sep, FUN=function(split) {
      strsplit(line, split=split)[[1]]
    })
    nbrOfColumns <- sapply(columns, FUN=length)
    max <- which.max(nbrOfColumns)
    sep <- sep[max]
    verbose && printf(verbose, "Choosen separator: '%s' (0x%s)\n", sep, charToRaw(sep))
    verbose && exit(verbose)
  }

  lines <- c(line, readLines(con, n=9))
  verbose && print(verbose, line)
  topRows <- strsplit(lines, split=sep)
  topRows <- lapply(topRows, trim)
  verbose && print(verbose, topRows)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Remove quotes?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  quote <- this$quote
  if (!is.null(quote)) {
    for (pattern in c(sprintf("^%s", quote), sprintf("%s$", quote))) {
      topRows <- lapply(topRows, FUN=function(row) {
        gsub(pattern, "", row)
      })
    }
  }

  verbose && cat(verbose, "Columns: ", paste(paste("'", topRows, "'", sep=""), collapse=", "), level=-10)

  hdr <- list(
    comments=comments,
    commentArgs=commentArgs,
    sep=sep,
    quote=quote,
    skip=skip,
    skipMax=skipMax,
    topRows=topRows
  )

  verbose && str(verbose, hdr)

  verbose && exit(verbose)

  hdr
}, protected=TRUE); # readRawHeader()


setMethodS3("getReadArguments", "TabularTextFile", function(this, fileHeader=NULL, colClasses=c("*"=NA, getDefaultColumnClassPatterns(this)), defColClass="NULL", stringsAsFactors=FALSE, skip=this$skip, ..., verbose=FALSE) {

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'fileHeader':
  if (is.null(fileHeader)) {
    fileHeader <- getHeader(this, skip=skip, ...)
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }



  verbose && enter(verbose, "Building arguments for read.table()")


  # Optional user arguments
  userArgs <- list(stringsAsFactors=stringsAsFactors, ...)
  verbose && cat(verbose, "User arguments:")
  verbose && str(verbose, userArgs)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Infer column classes
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Default column classes
  columns <- getColumnNames(this)
  hasColClassPatterns <- (!is.null(names(colClasses)))
  if (!is.null(columns) && hasColClassPatterns) {
    nbrOfColumns <- length(columns)
    defColClasses <- rep(defColClass, times=nbrOfColumns)
    defColClassPatterns <- defColClasses

    colClassPatterns <- colClasses
    names <- names(colClassPatterns)
    colClasses <- NULL ## Not needed anymore

    # If the same column class name is specified more than ones, then
    # let the latter override the former
    dups <- rev(duplicated(rev(names)))
    if (any(dups)) {
      colClassPatterns <- colClassPatterns[!dups]
      names <- names(colClassPatterns)
    }

    # Default columns?
    pos <- which(names == "*")
    if (length(pos) > 0) {
      # Exclude extra '*':s
      if (length(pos) > 1) {
        colClassPatterns <- colClassPatterns[-(pos[-1])]
        pos <- pos[1]
      }

      # Insert defaults
      colClass <- colClassPatterns[pos]
      names <- names(colClassPatterns)
      if (length(colClassPatterns) > 1) {
        names <- insert(names[-pos], ats=pos, values=rep("*", times=nbrOfColumns))
        idxs <- which(names == "*")
        names[idxs] <- sprintf("^%s$", columns)

        colClassPatterns <- insert(colClassPatterns[-pos], ats=pos,
                                   values=rep("*", times=nbrOfColumns))
        names(colClassPatterns) <- names
        colClassPatterns[idxs] <- colClass
      } else {
        colClassPatterns <- rep(colClass, times=nbrOfColumns)
        names(colClassPatterns) <- sprintf("^%s$", columns)
      }
    } # if (length(pos) > 0)

    verbose && cat(verbose, "Pattern used to identify column classes:", level=-20)
    verbose && print(verbose, colClassPatterns, level=-20)

    verbose && cat(verbose, "Generate column classes:")
    # Read everything by default
    colClasses <- defColClasses
    names(colClasses) <- columns

    # Update column classes according to patterns
    for (kk in seq_along(colClassPatterns)) {
      pattern <- names(colClassPatterns)[kk]
      idxs <- which(regexpr(pattern, columns) != -1)
      if (length(idxs) > 0) {
        colClass <- colClassPatterns[kk]
        colClasses[idxs] <- colClass
      }
    } # for (kk ...)
    colClassPatterns <- NULL ## Not needed anymore
  } else {
    # If column names cannot be inferred, use the default
    nbrOfColumns <- length(fileHeader$topRows[[1]])
    # Where argument 'colClasses' explicitly specified?
    if (!missing(colClasses) && !is.null(colClasses)) {
      # Sanity check
      if (length(colClasses) != nbrOfColumns) {
        throw(sprintf("The number of elements in argument 'colClasses' does not match the number of column in the file: %d != %d", length(colClasses),  nbrOfColumns))
      }
    } else {
      colClasses <- rep(colClasses["*"], times=nbrOfColumns)
    }
  } # if (!is.null(columns) && hasColClassPatterns)

  verbose && cat(verbose, "Column classes:", level=-20)
  verbose && print(verbose, colClasses, level=-20)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Setup read.table() arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Inferred arguments
  ch <- getCommentChar(this)
  args <- list(
    header       = hasColumnHeader(this),
    colClasses   = colClasses,
    skip         = fileHeader$skipMax,
    sep          = fileHeader$sep,
    quote        = fileHeader$quote,
    fill         = this$fill,
    comment.char = ifelse(is.null(ch), "", ch),
    check.names  = FALSE,
    na.strings   = c("---", "NA")
  )

  # Overwrite with user specified arguments, if any
  if (length(userArgs) > 0) {
    verbose && enter(verbose, "Overwriting inferred arguments with user arguments")
    for (key in names(userArgs)) {
      args[[key]] <- userArgs[[key]]
    }
    verbose && exit(verbose)
  }


  # Drop NULL arguments
  keep <- !sapply(args, FUN=is.null)
  args <- args[keep]

  verbose && exit(verbose)


  args
}, protected=TRUE) # getReadArguments()




###########################################################################/**
# @RdocMethod readDataFrame
#
# @title "Reads the tabular data as a data frame"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{con}{(Internal) If a @connection, then it is used, otherwise
#   a new file connection is temporarily opened and used.}
#   \item{rows}{(Optional) An @integer @vector specifying which rows to
#    be read.}
#   \item{nrow}{(Optional) An @integer specifying how many rows to read.
#    If specified, it corresponds to specifying \code{rows=seq_len(nrow)}.}
#   \item{trimQuotes}{(Optional) If @TRUE, quotes are trimmed from numeric
#    columns before parsing them as numerics.  This makes it possible to
#    read quoted numeric values.}
#   \item{...}{Passed to internal @seemethod "getReadArguments".}
#   \item{debug}{If @TRUE, additional details on the file and how it was
#    read is returned as part of the attributes.}
#   \item{verbose}{A @logical or a @see "R.utils::Verbose" object.}
# }
#
# \value{
#   Returns a @data.frame.
# }
#
# \section{Reading quoted numerics}{
#   If a specific data column is specified as being numeric in
#   argument \code{colClasses} and that column contains quoted values
#   it is necessary to use argument \code{trimQuotes=TRUE}, otherwise
#   @see "base::scan" throws an exception similar to:
#   \code{scan() expected 'a real', got '"1.0"'}.
# }
#
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("readDataFrame", "TabularTextFile", function(this, con=NULL, rows=NULL, nrow=NULL, trimQuotes=FALSE, ..., debug=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'rows' and 'nrow':
  if (!is.null(rows) && !is.null(nrow)) {
    throw("Only one of arguments 'rows' and 'nrow' can be specified.")
  }

  if (!is.null(rows)) {
    rows <- Arguments$getIndices(rows)
    nrow <- max(rows)
  }

  if (!is.null(nrow)) {
    nrow <- Arguments$getInteger(nrow, range=c(1,Inf))
  }

  # Argument 'trimQuotes':
  trimQuotes <- Arguments$getLogical(trimQuotes)

  # Argument 'debug':
  debug <- Arguments$getLogical(debug)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Reading ", class(this)[1L])


  # Attributes to be added
  attributes <- list()


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reading header to infer read.table() arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hdr <- getHeader(this, ..., verbose=less(verbose, 5))
  attributes$header <- hdr

  # Get read arguments
  args <- getReadArguments(this, fileHeader=hdr, nrows=nrow, ...,
                                               verbose=less(verbose, 5))

  verbose && cat(verbose, "Arguments inferred from file header:")
  verbose && print(verbose, args)
  attributes$readArguments <- args


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Identify names of columns read
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  columns <- getColumnNames(this)
  verbose && printf(verbose, "Column names (%d):\n", length(columns))
  verbose && cat(verbose, paste(columns, collapse=", "))

  if (!is.null(columns)) {
    verbose && enter(verbose, "Matching column names:")
    verbose && printf(verbose, "Column classes (%d):\n",
                                                length(args$colClasses))
    verbose && cat(verbose, paste(args$colClasses, collapse=", "))
    columns[args$colClasses == "NULL"] <- NA
    columns <- columns[!is.na(columns)]
    verbose && exit(verbose)
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Open a file connection?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.null(con)) {
    pathname <- getPathname(this)
    verbose && cat(verbose, "Pathname: ", pathname)

    # Open file connection
    con <- file(pathname, open="r")
    on.exit({
      if (!is.null(con)) {
        close(con)
        con <- NULL
      }
    })
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reading data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Calling read.table()")


  # SPECIAL CASE/WORKAROUND: read.table()/scan() will give an error
  # if a numeric value is quoted and 'colClasses' specifies it as
  # a numeric value.  In order to read such values, we need to remove
  # the quotes first. /HB 2011-07-13

  # Check if we need to trim quotes
  trimQuotes <- (trimQuotes && nchar(args$quote) > 0L)
  trimQuotes <- (trimQuotes && length(args$colClasses) > 0L)
  if (trimQuotes) {
    classesToPatch <- c("integer", "numeric", "double", "complex")
    toPatch <- is.element(args$colClasses, classesToPatch)
    trimQuotes <- c(trimQuotes && any(toPatch))
  }

  # Read by first trimming quotes?
  if (trimQuotes) {
    verbose && enter(verbose, "Trimming quotes from numeric columns before parsing them as numbers")
    # This is a workaround for reading quoted numerics. /HB 2011-07-13
    verbose && cat(verbose, "Columns that need to have quotes trimmed:")
    verbose && str(verbose, which(toPatch))
    args0 <- args

    # (1) Read to-be-patched columns as characters.
    colClasses <- args$colClasses
    colClasses[toPatch] <- "character"
    args$colClasses <- colClasses
  }

  verbose && cat(verbose, "Arguments used to read tabular file:")
  ## Since R v3.2.1 rev 68837, read.table() now utilized (fixed)
  ## named 'colClasses' similar to readDataFrame().  For now, make
  ## sure to pass non-named 'colClasses' to read.table().
  ## FIX ME: Should we rewrite readDataFrame() to better utilize
  ## that read.table() now also looks at fixed names? /HB 2015-08-06
  args$colClasses <- unname(args$colClasses)
  args <- c(list(con), args)
  verbose && print(verbose, args)
  data <- do.call(read.table, args=args)
  nbrOfRowsRead <- nrow(data)
  verbose && cat(verbose, "Raw data read by read.table():")
  verbose && str(verbose, data)
  verbose && cat(verbose, "Number of rows read: ", nbrOfRowsRead)
  verbose && cat(verbose, "Number of columns read: ", ncol(data))

  # Extract subset of rows?
  if (!is.null(rows)) {
    if (max(rows) > nbrOfRowsRead) {
      rows <- intersect(rows, 1:nbrOfRowsRead)
      warning("Argument 'rows' was out of range [1,", nbrOfRowsRead,
                                "]. Ignored rows beyond this range.")
    }
    data <- data[rows,,drop=FALSE]
  } else {
    rownames(data) <- NULL
  }

  # Was data read by first trimming quotes?
  if (trimQuotes) {
    # (2) Re-read numeric columns one by one
    colClasses <- args0$colClasses

    # Note that some columns may have been ignored
    keep <- (colClasses != "NULL")
    colClasses <- colClasses[keep]
    toPatch <- toPatch[keep]

    # Sanity check
    .stop_if_not(length(colClasses) == ncol(data))

    na.strings <- args0$na.strings

    verbose && enter(verbose, "Parsing numeric columns")
    toPatch <- which(toPatch)
    for (kk in seq_along(toPatch)) {
      col <- toPatch[kk]
      colClass <- colClasses[col]
      verbose && enter(verbose, sprintf("Parsing #%d (column #%d as '%s') of %d", kk, col, colClass, length(toPatch)))
      values <- data[[col]]

      # Quick/bad way, but will turn non-valid values into NA
      ##  storage.mode(values) <- colClass

      verbose && cat(verbose, "Non-parsed values:")
      verbose && str(verbose, values)

      bfr <- paste(values, collapse="\n")
      conT <- textConnection(bfr, open="r")
      on.exit({
        if (!is.null(conT)) close(conT)
      }, add=TRUE)

      # Try to read the values as the correct type.
      # (Could scan(), which has less overhead, be used here? /HB 2013-09-30)
      valuesT <- read.table(file=conT, quote="", colClasses=colClass, na.strings=na.strings, blank.lines.skip=FALSE)[[1L]]

      verbose && cat(verbose, "Parsed values:")
      verbose && str(verbose, valuesT)

      # Sanity check
      .stop_if_not(length(valuesT) == length(values))

      close(conT); conT <- NULL
      data[[col]] <- valuesT

      # Not needed anymore
      bfr <- values <- valuesT <- NULL
      verbose && exit(verbose)
    } # for (kk ...)
    verbose && exit(verbose)

    verbose && exit(verbose)
  } # if (trimQuotes)


  # Sanity check
  if (length(columns) > 0L) {
    if (ncol(data) != length(columns)) {
      throw("Number of read data columns does not match the number of column headers: ", ncol(data), " != ", length(columns))
    }
    colnames(data) <- columns
  }

  verbose && str(verbose, data)
  verbose && exit(verbose)

  if (debug) {
    attr(data, "fileHeader") <- hdr
    for (key in names(attributes)) {
      attr(data, key) <- attributes[[key]]
    }
  }

  verbose && exit(verbose)

  data
}) # readDataFrame()



setMethodS3("readColumns", "TabularTextFile", function(this, columns=seq_len(ncol(this)), colClasses=NULL, ..., check.names=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Arguments 'columns':
  maxNbrOfColumns <- ncol(this)
  if (is.null(columns)) {
    columnNames <- getColumnNames(this)
  } else if (is.numeric(columns)) {
    columns <- Arguments$getIndices(columns, max=maxNbrOfColumns)
    columnNames <- getColumnNames(this)
    if (!is.null(columnNames)) {
      columnNames <- columnNames[columns]
    }
  } else {
    columnNames <- Arguments$getCharacters(columns)
  }

  # Argument 'colClasses':
  if (!is.null(colClasses)) {
    names <- names(colClasses)
    colClasses <- Arguments$getCharacters(colClasses)
    names(colClasses) <- names
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Reading columns")
  verbose && cat(verbose, "Argument 'columns': ", hpaste(columns))
  verbose && cat(verbose, "Argument 'colClasses':")
  verbose && print(verbose, colClasses)

  verbose && cat(verbose, "Column names': ", hpaste(columnNames))
  # Setup column classes, iff missing
  if (is.null(colClasses)) {
    if (is.null(columnNames)) {
      colClasses[-columns] <- "NULL"
    } else {
      colClasses <- rep("character", times=length(columnNames))
      names(colClasses) <- sprintf("^%s$", columnNames)
    }
  } else {
    if (is.null(names(colClasses))) {
      names(colClasses) <- sprintf("^%s$", columnNames)
    }
  }

  verbose && cat(verbose, "Column classes:")
  verbose && print(verbose, colClasses)

  data <- readDataFrame(this, colClasses=colClasses, ..., verbose=less(verbose, 50))

  # Subset
  if (ncol(data) > 0L) {
    verbose && enter(verbose, "Subsetting columns")
    verbose && str(verbose, data)

    if (!is.null(columnNames)) {
      columns <- match(columnNames, names(data))
    }
    verbose && cat(verbose, "Columns to keep: ", hpaste(columns))

    # Sanity check
    columns <- Arguments$getIndices(columns, max=ncol(data))

    # Need to rearrange?
    if (any(diff(columns) != 1L) || ncol(data) > length(columns)) {
      data <- data[,columns,drop=FALSE]
      if (!check.names) {
        colnames(data) <- columnNames
      }
    }

    verbose && exit(verbose)
  }

  # Sanity check
  .stop_if_not(ncol(data) == length(columns))

  verbose && exit(verbose)

  data
}, protected=TRUE)



# AD HOC fix to speed up ll(), which calls dimension() on each object,
# which in turn calls dim() and dim() is really slow for this class,
# because it has to infer the number of rows by reading the complete
# file. The fix is to return NA for the number of rows if the file size
# is larger than 10MB unless nbrOfLines() has already been called.
# /HB 2008-07-22
setMethodS3("dimension", "TabularTextFile", function(this, ...) {
  c(nbrOfRows(this, fast=TRUE), nbrOfColumns(this, fast=TRUE))
}, private=TRUE)



###########################################################################/**
# @RdocMethod nbrOfRows
#
# @title "Counts the number of data rows"
#
# \description{
#  @get "title".  The count does not include the header rows or comments.
# }
#
# @synopsis
#
# \arguments{
#   \item{fast}{Argument passed to @seemethod "nbrOfLines".}
#   \item{...}{Optional arguments passed to @seemethod "getHeader".}
# }
#
# \value{
#   Returns a @character @vector.
# }
# @author
#
# \seealso{
#    The number of data rows is always less or equal to the number of lines
#    as returned by @seemethod "nbrOfLines".
#   Internally, @see "R.utils::countLines" is used.
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("nbrOfRows", "TabularTextFile", function(this, fast=FALSE, ...) {
  hdr <- getHeader(this, ...)

  nbrOfCommentRows <- length(hdr$comments)
  nbrOfRowsToSkip <- hdr$skip
  hasColumnNames <- (length(hdr$columns) > 0L)
  nbrOfNonDataRows <- nbrOfCommentRows + nbrOfRowsToSkip +
                      as.integer(hasColumnNames)

  n <- nbrOfLines(this, fast=fast)
  n <- n - nbrOfNonDataRows
  n <- as.integer(n)

  n
})



###########################################################################/**
# @RdocMethod nbrOfLines
#
# @title "Counts the number of lines in the data file"
#
# \description{
#  @get "title".  The count include header rows, comments and more.
# }
#
# @synopsis
#
# \arguments{
#   \item{fast}{If @TRUE, @NA is returned for large data files (>1Mb),
#     unless the number of lines has already been counted.}
#   \item{...}{Optional arguments passed to @see "R.utils::countLines".}
# }
#
# \value{
#   Returns a @character @vector.
# }
#
# @author
#
# \seealso{
#    To count the number of data rows is the data table,
#    use @seemethod "nbrOfRows".
#   Internally, @see "R.utils::countLines" is used.
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("nbrOfLines", "TabularTextFile", function(this, fast=FALSE, ...) {
  pathname <- getPathname(this)

  n <- this$.nbrOfLines
  mtime <- getLastModifiedOn(this)
  if (is.null(n) || is.na(mtime) || !identical(mtime, attr(n, "mtime"))) {
    if (fast) {
      if (getFileSize(this) < 10e6)
        fast <- FALSE
    }

    if (fast) {
      n <- NA_integer_
    } else {
      n <- countLines(pathname, ...)
      attr(n, "mtime") <- mtime
      this$.nbrOfLines <- n
    }
  }

  n
})




###########################################################################/**
# @RdocMethod readLines
#
# @title "Reads the lines of the data file as strings"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Optional arguments passed to @see "base::readLines".}
# }
#
# \value{
#   Returns a @character @vector.
# }
# @author
#
# \seealso{
#   @seemethod "readDataFrame".
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("readLines", "TabularTextFile", function(con, ...) {
  # To please R CMD check
  this <- con
  pathname <- getPathname(this)
  readLines(pathname, ...)
})

Try the R.filesets package in your browser

Any scripts or data that you put into this service are public.

R.filesets documentation built on July 21, 2022, 5:11 p.m.