R/TabularTextFile.R
In R.filesets: Easy Handling of and Access to Files Organized in Structured Directories

###########################################################################/**
# @RdocClass TabularTextFile
#
# @title "The TabularTextFile class"
#
# \description{
#  @classhierarchy
#
#  A TabularTextFile is an object referring to a tabular text file
#  on a file system containing data in a tabular format.
#  Methods for reading all or a subset of the tabular data exist.
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Arguments passed to @see "GenericTabularFile".}
#   \item{sep}{A @character specifying the symbol used to separate the
#     cell entries.  If more than one symbol is specified, it will try to
#     select the correct one by peeking into the file.}
#   \item{quote}{A @character specifying the quote symbol used, if any.}
#   \item{fill}{As in @see "utils::read.table".}
#   \item{skip}{As in @see "utils::read.table".}
#   \item{columnNames}{A @logical or a @character @vector. If @TRUE,
#      then column names are inferred from the file.  If a @character
#      @vector, then the column names are given by this argument.}
#   \item{commentChar}{A single @character specifying which symbol
#      should be used for comments, cf. @see "utils::read.table".}
#   \item{.verify, verbose}{(Internal only) If @TRUE, the file is
#      verified while the object is instantiated by the constructor.
#      The verbose argument is passed to the verifier function.}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# @examples "../incl/TabularTextFile.readDataFrame.Rex"
#
# @author
#
# \seealso{
#   An object of this class is typically part of an @see "TabularTextFileSet".
# }
#*/###########################################################################
setConstructorS3("TabularTextFile", function(..., sep=c("\t", ","), quote="\"", fill=FALSE, skip=0L, columnNames=NA, commentChar="#", .verify=TRUE, verbose=FALSE) {
  # Argument 'columnNames':
  if (is.logical(columnNames)) {
    readColumnNames <- columnNames
    columnNames <- NULL
  } else if (is.character(columnNames)) {
    readColumnNames <- FALSE
  } else {
    throw("Argument 'columnNames' must be either a logical or a character vector: ", class(columnNames)[1])
  }

  # Argument 'commentChar':
  if (identical(commentChar, "") || identical(commentChar, FALSE)) {
    commentChar <- NULL
  } else if (!is.null(commentChar)) {
    commentChar <- Arguments$getCharacter(commentChar, nchar=c(1,1))
  }


  this <- extend(GenericTabularFile(..., .verify=FALSE), "TabularTextFile",
    "cached:.fileHeader" = NULL,
    "cached:.nbrOfLines" = NULL,
    .columnNameTranslator = NULL,
    sep = sep,
    quote = quote,
    fill = fill,
    skip = skip,
    .commentChar = commentChar,
    .columnNames = columnNames,
    readColumnNames = readColumnNames
  )

  if (.verify) {
    verify(this, ..., verbose=verbose)
    # Clear temporarily settings
    this$.fileHeader <- NULL
  }

  this
})


setMethodS3("as.character", "TabularTextFile", function(x, ...) {
  this <- x
  s <- NextMethod()
  colnames <- getColumnNames(this)
  if (length(colnames) > 0L) {
    columns <- paste("'", colnames, "'", sep="")
    s <- c(s, sprintf("Columns [%d]: %s", length(columns), paste(columns, collapse=", ")))
  } else {
    s <- c(s, sprintf("Columns [NA]: <not reading column names>"))
  }
  s <- c(s, sprintf("Number of text lines: %d", nbrOfLines(this, fast=TRUE)))
  s
}, protected=TRUE)


setMethodS3("verify", "TabularTextFile", function(this, ..., verbose=FALSE) {
  # Nothing to do?
  pathname <- getPathname(this)
  if (is.null(pathname) || is.na(pathname))
    return(invisible(this))


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Validating file contents")

  tryCatch({
    data <- readDataFrame(this, nrow=10L, verbose=verbose)
  }, error = function(ex) {
    throw("File format error of the tabular file ('", getPathname(this), "'): ", ex$message)
  })

  verbose && exit(verbose)

  invisible(this)
}, private=TRUE)


setMethodS3("getCommentChar", "TabularTextFile", function(this, ...) {
  this$.commentChar
}, protected=TRUE)


setMethodS3("setCommentChar", "TabularTextFile", function(this, ch, ...) {
  if (!is.null(ch)) {
    ch <- Arguments$getCharacter(ch, nchar=c(1,1))
  }
  this$.commentChar <- ch
  invisible(this)
}, protected=TRUE)


setMethodS3("readColumnNames", "TabularTextFile", function(this, ...) {
  res <- as.logical(this$readColumnNames)

  # No need to infer from header?
  if (!is.na(res)) {
    return(res)
  }

  hdr <- readRawHeader(this, ...)
  namesHdr <- hdr$commentArgs$columnNames
  if (is.null(namesHdr)) {
    msg <- sprintf("Cannot infer whether the data table has column names or not, because header comment argument 'columnNames' is missing. Will assume there are column names (readColumnNames=TRUE): %s", getPathname(this))
#    warning(msg)
    return(TRUE)
  }

  # If argument 'columnNames' equals the first data row, then yes.
  namesData <- hdr$topRows[[1]]
  res <- all(namesData == namesHdr)

  # Store results(?!?)
#  this$readColumnNames <- res

  res
}, protected=TRUE)


###########################################################################/**
# @RdocMethod hasColumnHeader
#
# @title "Checks if there are column names in the header"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Not used.}
# }
#
# \value{
#   Returns a @logical.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("hasColumnHeader", "TabularTextFile", function(this, ...) {
  readColumnNames(this)
}, protected=TRUE)



###########################################################################/**
# @RdocMethod getDefaultColumnNames
#
# @title "Gets the default column names"
#
# \description{
#  @get "title" by inferring it from the file header.
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Optional arguments passed @seemethod "getHeader".}
# }
#
# \value{
#   Returns @character @vector,
#   or @NULL if there are no column names in the file header.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("getDefaultColumnNames", "TabularTextFile", function(this, ...) {
  # (a) Hardcoded/user-specified column names?
  names <- this$.columnNames
  if (!is.null(names)) {
    return(names)
  }

  hdr <- getHeader(this, ...)

  # (b) Infer column names from data table
  if (hasColumnHeader(this)) {
    names <- hdr$columns
    return(names)
  }

  # (c) Infer column names from header argument 'columnNames'?
  useHeaderArgs <- this$.useHeaderArgs
  if (is.null(useHeaderArgs)) useHeaderArgs <- TRUE
  if (useHeaderArgs) {
    args <- hdr$commentArgs
    if (length(args) > 0L) {
      names <- args$columnNames
      return(names)
    }
  }

  # (c) There are no column names
  NULL
}, protected=TRUE)


setMethodS3("getDefaultColumnClasses", "TabularTextFile", function(this, ...) {
  ncol <- nbrOfColumns(this)

  hdr <- getHeader(this, ...)

  # (a) Infer column names from header argument 'columnNames'?
  useHeaderArgs <- this$.useHeaderArgs
  if (is.null(useHeaderArgs)) useHeaderArgs <- TRUE
  if (useHeaderArgs) {
    args <- hdr$commentArgs
    if (length(args) > 0L) {
      colClasses <- args$columnClasses
      if (!is.null(colClasses)) {
        # Sanity check
        .stop_if_not(length(colClasses) == ncol)
        return(colClasses)
      }
    }
  }

  # (b) Column classes are not specified in the file
  NULL
}, protected=TRUE)


setMethodS3("getDefaultColumnClassPatterns", "TabularTextFile", function(this, ...) {
  names <- getColumnNames(this, ...)
  colClasses <- getDefaultColumnClasses(this, ...)
  if (is.null(colClasses)) return(NULL)

  names <- sprintf("^%s$", names)
  names(colClasses) <- names
  colClasses
}, protected=TRUE)


###########################################################################/**
# @RdocMethod getHeader
#
# @title "Gets the file header"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Passed to internal @seemethod "readRawHeader".}
#   \item{header}{A @logical specifying whether there are column
#    headers or not.}
#   \item{force}{If @TRUE, an already retrieved header will be ignored.}
# }
#
# \value{
#   Returns a named @list.
# }
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("getHeader", "TabularTextFile", function(this, ..., header=TRUE, force=FALSE) {
  hdr <- this$.fileHeader
  if (force || is.null(hdr) || hasBeenModified(this, update=FALSE)) {
    hdr <- readRawHeader(this, ...)
    if (header && hasColumnHeader(this)) {
      hdr$columns <- hdr$topRows[[1]]
    }
    this$.fileHeader <- hdr
  }
  hdr
})




setMethodS3("readRawHeader", "TabularTextFile", function(this, con=NULL, skip=this$skip, sep=this$sep, ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'skip':
  if (is.null(skip)) {
    skip <- 0L
  } else if (is.character(skip)) {
    skip <- Arguments$getRegularExpression(skip)
  } else {
    skip <- Arguments$getInteger(skip, range=c(0,Inf))
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Reading tabular file header from ", class(this)[1])

  # Open a file connection?
  if (is.null(con)) {
    pathname <- getPathname(this)
    verbose && cat(verbose, "Pathname: ", pathname)

    # Open file connection
    con <- file(pathname, open="r")
    on.exit({
      if (!is.null(con)) {
        close(con)
        con <- NULL
      }
    })
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # How to skip
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  skipMax <- 0L

  # Skip a fixed number of rows?
  if (is.numeric(skip)) {
    skippedLines <- readLines(con, n=skip)
    skipMax <- skipMax + skip
  }

  # Skip until regular expression?
  skipUntil <- NULL
  if (is.character(skip)) {
    skipUntil <- skip
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Header comments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read header comments
  comments <- c()
  ch <- getCommentChar(this)
  ch <- if (is.null(ch)) "#" else ch
  pattern <- sprintf("^%s", ch)
  ready <- FALSE
  repeat {
    line <- readLines(con, n=1L)
    if (length(line) == 0) break
    isEmpty <- (regexpr("^$", line) != -1L)
    if (!isEmpty) {
      if (!is.null(skipUntil)) {
        if (regexpr(skipUntil, line) != -1L) {
          break
        }
      }
      isComments <- (regexpr(pattern, line) != -1L)
      if (isComments) {
        comments <- c(comments, line)
        skipMax <- skipMax + 1L
      } else if (is.null(skipUntil)) {
        break
      }
    }
  } # repeat

  verbose && cat(verbose, "Header comments:", level=-20)
  verbose && str(verbose, comments, level=-20)


  # Parse header comments
  pattern <- sprintf("^%s[ ]*([^:]+):[ ]*(.*)", ch)
  commentArgs <- grep(pattern, comments, value=TRUE)
  if (length(commentArgs) > 0L) {
    keys <- gsub(pattern, "\\1", commentArgs)
    keys <- trim(keys)
    commentArgs <- gsub(pattern, "\\2", commentArgs)
    commentArgs <- trim(commentArgs)
    commentArgs <- strsplit(commentArgs, split="\t", fixed=TRUE)
    names(commentArgs) <- keys
  } else {
    commentArgs <- NULL
  }
  verbose && cat(verbose, "Header comment arguments:", level=-20)
  verbose && str(verbose, commentArgs, level=-20)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Column separator
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Infer column separator from the first line after the header comments?
  if (length(sep) > 1L) {
    verbose && enter(verbose, "Identifying the separator that returns most columns")
    verbose && cat(verbose, "Line:")
    verbose && print(verbose, line)
    verbose && cat(verbose, "Separators:")
    verbose && str(verbose, sep)
    columns <- base::lapply(sep, FUN=function(split) {
      strsplit(line, split=split)[[1]]
    })
    nbrOfColumns <- sapply(columns, FUN=length)
    max <- which.max(nbrOfColumns)
    sep <- sep[max]
    verbose && printf(verbose, "Choosen separator: '%s' (0x%s)\n", sep, charToRaw(sep))
    verbose && exit(verbose)
  }

  lines <- c(line, readLines(con, n=9))
  verbose && print(verbose, line)
  topRows <- strsplit(lines, split=sep)
  topRows <- lapply(topRows, trim)
  verbose && print(verbose, topRows)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Remove quotes?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  quote <- this$quote
  if (!is.null(quote)) {
    for (pattern in c(sprintf("^%s", quote), sprintf("%s$", quote))) {
      topRows <- lapply(topRows, FUN=function(row) {
        gsub(pattern, "", row)
      })
    }
  }

  verbose && cat(verbose, "Columns: ", paste(paste("'", topRows, "'", sep=""), collapse=", "), level=-10)

  hdr <- list(
    comments=comments,
    commentArgs=commentArgs,
    sep=sep,
    quote=quote,
    skip=skip,
    skipMax=skipMax,
    topRows=topRows
  )

  verbose && str(verbose, hdr)

  verbose && exit(verbose)

  hdr
}, protected=TRUE); # readRawHeader()


setMethodS3("getReadArguments", "TabularTextFile", function(this, fileHeader=NULL, colClasses=c("*"=NA, getDefaultColumnClassPatterns(this)), defColClass="NULL", stringsAsFactors=FALSE, skip=this$skip, ..., verbose=FALSE) {

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'fileHeader':
  if (is.null(fileHeader)) {
    fileHeader <- getHeader(this, skip=skip, ...)
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }



  verbose && enter(verbose, "Building arguments for read.table()")


  # Optional user arguments
  userArgs <- list(stringsAsFactors=stringsAsFactors, ...)
  verbose && cat(verbose, "User arguments:")
  verbose && str(verbose, userArgs)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Infer column classes
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Default column classes
  columns <- getColumnNames(this)
  hasColClassPatterns <- (!is.null(names(colClasses)))
  if (!is.null(columns) && hasColClassPatterns) {
    nbrOfColumns <- length(columns)
    defColClasses <- rep(defColClass, times=nbrOfColumns)
    defColClassPatterns <- defColClasses

    colClassPatterns <- colClasses
    names <- names(colClassPatterns)
    colClasses <- NULL ## Not needed anymore

    # If the same column class name is specified more than ones, then
    # let the latter override the former
    dups <- rev(duplicated(rev(names)))
    if (any(dups)) {
      colClassPatterns <- colClassPatterns[!dups]
      names <- names(colClassPatterns)
    }

    # Default columns?
    pos <- which(names == "*")
    if (length(pos) > 0) {
      # Exclude extra '*':s
      if (length(pos) > 1) {
        colClassPatterns <- colClassPatterns[-(pos[-1])]
        pos <- pos[1]
      }

      # Insert defaults
      colClass <- colClassPatterns[pos]
      names <- names(colClassPatterns)
      if (length(colClassPatterns) > 1) {
        names <- insert(names[-pos], ats=pos, values=rep("*", times=nbrOfColumns))
        idxs <- which(names == "*")
        names[idxs] <- sprintf("^%s$", columns)

        colClassPatterns <- insert(colClassPatterns[-pos], ats=pos,
                                   values=rep("*", times=nbrOfColumns))
        names(colClassPatterns) <- names
        colClassPatterns[idxs] <- colClass
      } else {
        colClassPatterns <- rep(colClass, times=nbrOfColumns)
        names(colClassPatterns) <- sprintf("^%s$", columns)
      }
    } # if (length(pos) > 0)

    verbose && cat(verbose, "Pattern used to identify column classes:", level=-20)
    verbose && print(verbose, colClassPatterns, level=-20)

    verbose && cat(verbose, "Generate column classes:")
    # Read everything by default
    colClasses <- defColClasses
    names(colClasses) <- columns

    # Update column classes according to patterns
    for (kk in seq_along(colClassPatterns)) {
      pattern <- names(colClassPatterns)[kk]
      idxs <- which(regexpr(pattern, columns) != -1)
      if (length(idxs) > 0) {
        colClass <- colClassPatterns[kk]
        colClasses[idxs] <- colClass
      }
    } # for (kk ...)
    colClassPatterns <- NULL ## Not needed anymore
  } else {
    # If column names cannot be inferred, use the default
    nbrOfColumns <- length(fileHeader$topRows[[1]])
    # Where argument 'colClasses' explicitly specified?
    if (!missing(colClasses) && !is.null(colClasses)) {
      # Sanity check
      if (length(colClasses) != nbrOfColumns) {
        throw(sprintf("The number of elements in argument 'colClasses' does not match the number of column in the file: %d != %d", length(colClasses),  nbrOfColumns))
      }
    } else {
      colClasses <- rep(colClasses["*"], times=nbrOfColumns)
    }
  } # if (!is.null(columns) && hasColClassPatterns)

  verbose && cat(verbose, "Column classes:", level=-20)
  verbose && print(verbose, colClasses, level=-20)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Setup read.table() arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Inferred arguments
  ch <- getCommentChar(this)
  args <- list(
    header       = hasColumnHeader(this),
    colClasses   = colClasses,
    skip         = fileHeader$skipMax,
    sep          = fileHeader$sep,
    quote        = fileHeader$quote,
    fill         = this$fill,
    comment.char = ifelse(is.null(ch), "", ch),
    check.names  = FALSE,
    na.strings   = c("---", "NA")
  )

  # Overwrite with user specified arguments, if any
  if (length(userArgs) > 0) {
    verbose && enter(verbose, "Overwriting inferred arguments with user arguments")
    for (key in names(userArgs)) {
      args[[key]] <- userArgs[[key]]
    }
    verbose && exit(verbose)
  }


  # Drop NULL arguments
  keep <- !sapply(args, FUN=is.null)
  args <- args[keep]

  verbose && exit(verbose)


  args
}, protected=TRUE) # getReadArguments()




###########################################################################/**
# @RdocMethod readDataFrame
#
# @title "Reads the tabular data as a data frame"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{con}{(Internal) If a @connection, then it is used, otherwise
#   a new file connection is temporarily opened and used.}
#   \item{rows}{(Optional) An @integer @vector specifying which rows to
#    be read.}
#   \item{nrow}{(Optional) An @integer specifying how many rows to read.
#    If specified, it corresponds to specifying \code{rows=seq_len(nrow)}.}
#   \item{trimQuotes}{(Optional) If @TRUE, quotes are trimmed from numeric
#    columns before parsing them as numerics.  This makes it possible to
#    read quoted numeric values.}
#   \item{...}{Passed to internal @seemethod "getReadArguments".}
#   \item{debug}{If @TRUE, additional details on the file and how it was
#    read is returned as part of the attributes.}
#   \item{verbose}{A @logical or a @see "R.utils::Verbose" object.}
# }
#
# \value{
#   Returns a @data.frame.
# }
#
# \section{Reading quoted numerics}{
#   If a specific data column is specified as being numeric in
#   argument \code{colClasses} and that column contains quoted values
#   it is necessary to use argument \code{trimQuotes=TRUE}, otherwise
#   @see "base::scan" throws an exception similar to:
#   \code{scan() expected 'a real', got '"1.0"'}.
# }
#
# @author
#
# \seealso{
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("readDataFrame", "TabularTextFile", function(this, con=NULL, rows=NULL, nrow=NULL, trimQuotes=FALSE, ..., debug=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'rows' and 'nrow':
  if (!is.null(rows) && !is.null(nrow)) {
    throw("Only one of arguments 'rows' and 'nrow' can be specified.")
  }

  if (!is.null(rows)) {
    rows <- Arguments$getIndices(rows)
    nrow <- max(rows)
  }

  if (!is.null(nrow)) {
    nrow <- Arguments$getInteger(nrow, range=c(1,Inf))
  }

  # Argument 'trimQuotes':
  trimQuotes <- Arguments$getLogical(trimQuotes)

  # Argument 'debug':
  debug <- Arguments$getLogical(debug)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Reading ", class(this)[1L])


  # Attributes to be added
  attributes <- list()


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reading header to infer read.table() arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hdr <- getHeader(this, ..., verbose=less(verbose, 5))
  attributes$header <- hdr

  # Get read arguments
  args <- getReadArguments(this, fileHeader=hdr, nrows=nrow, ...,
                                               verbose=less(verbose, 5))

  verbose && cat(verbose, "Arguments inferred from file header:")
  verbose && print(verbose, args)
  attributes$readArguments <- args


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Identify names of columns read
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  columns <- getColumnNames(this)
  verbose && printf(verbose, "Column names (%d):\n", length(columns))
  verbose && cat(verbose, paste(columns, collapse=", "))

  if (!is.null(columns)) {
    verbose && enter(verbose, "Matching column names:")
    verbose && printf(verbose, "Column classes (%d):\n",
                                                length(args$colClasses))
    verbose && cat(verbose, paste(args$colClasses, collapse=", "))
    columns[args$colClasses == "NULL"] <- NA
    columns <- columns[!is.na(columns)]
    verbose && exit(verbose)
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Open a file connection?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.null(con)) {
    pathname <- getPathname(this)
    verbose && cat(verbose, "Pathname: ", pathname)

    # Open file connection
    con <- file(pathname, open="r")
    on.exit({
      if (!is.null(con)) {
        close(con)
        con <- NULL
      }
    })
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reading data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Calling read.table()")


  # SPECIAL CASE/WORKAROUND: read.table()/scan() will give an error
  # if a numeric value is quoted and 'colClasses' specifies it as
  # a numeric value.  In order to read such values, we need to remove
  # the quotes first. /HB 2011-07-13

  # Check if we need to trim quotes
  trimQuotes <- (trimQuotes && nchar(args$quote) > 0L)
  trimQuotes <- (trimQuotes && length(args$colClasses) > 0L)
  if (trimQuotes) {
    classesToPatch <- c("integer", "numeric", "double", "complex")
    toPatch <- is.element(args$colClasses, classesToPatch)
    trimQuotes <- c(trimQuotes && any(toPatch))
  }

  # Read by first trimming quotes?
  if (trimQuotes) {
    verbose && enter(verbose, "Trimming quotes from numeric columns before parsing them as numbers")
    # This is a workaround for reading quoted numerics. /HB 2011-07-13
    verbose && cat(verbose, "Columns that need to have quotes trimmed:")
    verbose && str(verbose, which(toPatch))
    args0 <- args

    # (1) Read to-be-patched columns as characters.
    colClasses <- args$colClasses
    colClasses[toPatch] <- "character"
    args$colClasses <- colClasses
  }

  verbose && cat(verbose, "Arguments used to read tabular file:")
  ## Since R v3.2.1 rev 68837, read.table() now utilized (fixed)
  ## named 'colClasses' similar to readDataFrame().  For now, make
  ## sure to pass non-named 'colClasses' to read.table().
  ## FIX ME: Should we rewrite readDataFrame() to better utilize
  ## that read.table() now also looks at fixed names? /HB 2015-08-06
  args$colClasses <- unname(args$colClasses)
  args <- c(list(con), args)
  verbose && print(verbose, args)
  data <- do.call(read.table, args=args)
  nbrOfRowsRead <- nrow(data)
  verbose && cat(verbose, "Raw data read by read.table():")
  verbose && str(verbose, data)
  verbose && cat(verbose, "Number of rows read: ", nbrOfRowsRead)
  verbose && cat(verbose, "Number of columns read: ", ncol(data))

  # Extract subset of rows?
  if (!is.null(rows)) {
    if (max(rows) > nbrOfRowsRead) {
      rows <- intersect(rows, 1:nbrOfRowsRead)
      warning("Argument 'rows' was out of range [1,", nbrOfRowsRead,
                                "]. Ignored rows beyond this range.")
    }
    data <- data[rows,,drop=FALSE]
  } else {
    rownames(data) <- NULL
  }

  # Was data read by first trimming quotes?
  if (trimQuotes) {
    # (2) Re-read numeric columns one by one
    colClasses <- args0$colClasses

    # Note that some columns may have been ignored
    keep <- (colClasses != "NULL")
    colClasses <- colClasses[keep]
    toPatch <- toPatch[keep]

    # Sanity check
    .stop_if_not(length(colClasses) == ncol(data))

    na.strings <- args0$na.strings

    verbose && enter(verbose, "Parsing numeric columns")
    toPatch <- which(toPatch)
    for (kk in seq_along(toPatch)) {
      col <- toPatch[kk]
      colClass <- colClasses[col]
      verbose && enter(verbose, sprintf("Parsing #%d (column #%d as '%s') of %d", kk, col, colClass, length(toPatch)))
      values <- data[[col]]

      # Quick/bad way, but will turn non-valid values into NA
      ##  storage.mode(values) <- colClass

      verbose && cat(verbose, "Non-parsed values:")
      verbose && str(verbose, values)

      bfr <- paste(values, collapse="\n")
      conT <- textConnection(bfr, open="r")
      on.exit({
        if (!is.null(conT)) close(conT)
      }, add=TRUE)

      # Try to read the values as the correct type.
      # (Could scan(), which has less overhead, be used here? /HB 2013-09-30)
      valuesT <- read.table(file=conT, quote="", colClasses=colClass, na.strings=na.strings, blank.lines.skip=FALSE)[[1L]]

      verbose && cat(verbose, "Parsed values:")
      verbose && str(verbose, valuesT)

      # Sanity check
      .stop_if_not(length(valuesT) == length(values))

      close(conT); conT <- NULL
      data[[col]] <- valuesT

      # Not needed anymore
      bfr <- values <- valuesT <- NULL
      verbose && exit(verbose)
    } # for (kk ...)
    verbose && exit(verbose)

    verbose && exit(verbose)
  } # if (trimQuotes)


  # Sanity check
  if (length(columns) > 0L) {
    if (ncol(data) != length(columns)) {
      throw("Number of read data columns does not match the number of column headers: ", ncol(data), " != ", length(columns))
    }
    colnames(data) <- columns
  }

  verbose && str(verbose, data)
  verbose && exit(verbose)

  if (debug) {
    attr(data, "fileHeader") <- hdr
    for (key in names(attributes)) {
      attr(data, key) <- attributes[[key]]
    }
  }

  verbose && exit(verbose)

  data
}) # readDataFrame()



setMethodS3("readColumns", "TabularTextFile", function(this, columns=seq_len(ncol(this)), colClasses=NULL, ..., check.names=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Arguments 'columns':
  maxNbrOfColumns <- ncol(this)
  if (is.null(columns)) {
    columnNames <- getColumnNames(this)
  } else if (is.numeric(columns)) {
    columns <- Arguments$getIndices(columns, max=maxNbrOfColumns)
    columnNames <- getColumnNames(this)
    if (!is.null(columnNames)) {
      columnNames <- columnNames[columns]
    }
  } else {
    columnNames <- Arguments$getCharacters(columns)
  }

  # Argument 'colClasses':
  if (!is.null(colClasses)) {
    names <- names(colClasses)
    colClasses <- Arguments$getCharacters(colClasses)
    names(colClasses) <- names
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Reading columns")
  verbose && cat(verbose, "Argument 'columns': ", hpaste(columns))
  verbose && cat(verbose, "Argument 'colClasses':")
  verbose && print(verbose, colClasses)

  verbose && cat(verbose, "Column names': ", hpaste(columnNames))
  # Setup column classes, iff missing
  if (is.null(colClasses)) {
    if (is.null(columnNames)) {
      colClasses[-columns] <- "NULL"
    } else {
      colClasses <- rep("character", times=length(columnNames))
      names(colClasses) <- sprintf("^%s$", columnNames)
    }
  } else {
    if (is.null(names(colClasses))) {
      names(colClasses) <- sprintf("^%s$", columnNames)
    }
  }

  verbose && cat(verbose, "Column classes:")
  verbose && print(verbose, colClasses)

  data <- readDataFrame(this, colClasses=colClasses, ..., verbose=less(verbose, 50))

  # Subset
  if (ncol(data) > 0L) {
    verbose && enter(verbose, "Subsetting columns")
    verbose && str(verbose, data)

    if (!is.null(columnNames)) {
      columns <- match(columnNames, names(data))
    }
    verbose && cat(verbose, "Columns to keep: ", hpaste(columns))

    # Sanity check
    columns <- Arguments$getIndices(columns, max=ncol(data))

    # Need to rearrange?
    if (any(diff(columns) != 1L) || ncol(data) > length(columns)) {
      data <- data[,columns,drop=FALSE]
      if (!check.names) {
        colnames(data) <- columnNames
      }
    }

    verbose && exit(verbose)
  }

  # Sanity check
  .stop_if_not(ncol(data) == length(columns))

  verbose && exit(verbose)

  data
}, protected=TRUE)



# AD HOC fix to speed up ll(), which calls dimension() on each object,
# which in turn calls dim() and dim() is really slow for this class,
# because it has to infer the number of rows by reading the complete
# file. The fix is to return NA for the number of rows if the file size
# is larger than 10MB unless nbrOfLines() has already been called.
# /HB 2008-07-22
setMethodS3("dimension", "TabularTextFile", function(this, ...) {
  c(nbrOfRows(this, fast=TRUE), nbrOfColumns(this, fast=TRUE))
}, private=TRUE)



###########################################################################/**
# @RdocMethod nbrOfRows
#
# @title "Counts the number of data rows"
#
# \description{
#  @get "title".  The count does not include the header rows or comments.
# }
#
# @synopsis
#
# \arguments{
#   \item{fast}{Argument passed to @seemethod "nbrOfLines".}
#   \item{...}{Optional arguments passed to @seemethod "getHeader".}
# }
#
# \value{
#   Returns a @character @vector.
# }
# @author
#
# \seealso{
#    The number of data rows is always less or equal to the number of lines
#    as returned by @seemethod "nbrOfLines".
#   Internally, @see "R.utils::countLines" is used.
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("nbrOfRows", "TabularTextFile", function(this, fast=FALSE, ...) {
  hdr <- getHeader(this, ...)

  nbrOfCommentRows <- length(hdr$comments)
  nbrOfRowsToSkip <- hdr$skip
  hasColumnNames <- (length(hdr$columns) > 0L)
  nbrOfNonDataRows <- nbrOfCommentRows + nbrOfRowsToSkip +
                      as.integer(hasColumnNames)

  n <- nbrOfLines(this, fast=fast)
  n <- n - nbrOfNonDataRows
  n <- as.integer(n)

  n
})



###########################################################################/**
# @RdocMethod nbrOfLines
#
# @title "Counts the number of lines in the data file"
#
# \description{
#  @get "title".  The count include header rows, comments and more.
# }
#
# @synopsis
#
# \arguments{
#   \item{fast}{If @TRUE, @NA is returned for large data files (>1Mb),
#     unless the number of lines has already been counted.}
#   \item{...}{Optional arguments passed to @see "R.utils::countLines".}
# }
#
# \value{
#   Returns a @character @vector.
# }
#
# @author
#
# \seealso{
#    To count the number of data rows is the data table,
#    use @seemethod "nbrOfRows".
#   Internally, @see "R.utils::countLines" is used.
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("nbrOfLines", "TabularTextFile", function(this, fast=FALSE, ...) {
  pathname <- getPathname(this)

  n <- this$.nbrOfLines
  mtime <- getLastModifiedOn(this)
  if (is.null(n) || is.na(mtime) || !identical(mtime, attr(n, "mtime"))) {
    if (fast) {
      if (getFileSize(this) < 10e6)
        fast <- FALSE
    }

    if (fast) {
      n <- NA_integer_
    } else {
      n <- countLines(pathname, ...)
      attr(n, "mtime") <- mtime
      this$.nbrOfLines <- n
    }
  }

  n
})




###########################################################################/**
# @RdocMethod readLines
#
# @title "Reads the lines of the data file as strings"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Optional arguments passed to @see "base::readLines".}
# }
#
# \value{
#   Returns a @character @vector.
# }
# @author
#
# \seealso{
#   @seemethod "readDataFrame".
#   @seeclass
# }
#
# @keyword IO
# @keyword programming
#*/###########################################################################
setMethodS3("readLines", "TabularTextFile", function(con, ...) {
  # To please R CMD check
  this <- con
  pathname <- getPathname(this)
  readLines(pathname, ...)
})
Any scripts or data that you put into this service are public.
R.filesets documentation built on May 29, 2024, 9:10 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
R.filesets
Easy Handling of and Access to Files Organized in Structured Directories

R/TabularTextFile.R
In R.filesets: Easy Handling of and Access to Files Organized in Structured Directories

Try the R.filesets package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

R.filesets Easy Handling of and Access to Files Organized in Structured Directories

R/TabularTextFile.R In R.filesets: Easy Handling of and Access to Files Organized in Structured Directories

Try the R.filesets package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

R.filesets
Easy Handling of and Access to Files Organized in Structured Directories

R/TabularTextFile.R
In R.filesets: Easy Handling of and Access to Files Organized in Structured Directories