R/rsk.R

Defines functions as.rsk unitFromStringRsk

Documented in as.rsk unitFromStringRsk

# vim:textwidth=80:expandtab:shiftwidth=4:softtabstop=4

#' Class to Store Rsk Data
#'
#' This class stores Ruskin data, from RBR (see reference 1).
#'
#' A [rsk-class] object may be read with [read.rsk()] or created with
#' [as.rsk()], but the former method is much preferred because it
#' retains the entirety of the information in the file.
#' Plots can be made with [plot,rsk-method()], while
#' [summary,rsk-method()] produces statistical summaries and `show`
#' produces overviews.   If atmospheric pressure has not been removed from the
#' data, [rskPatm()] may provide guidance as to its value,
#' but this is no equal to record-keeping at sea.
#'
#' @templateVar class rsk
#'
#' @templateVar dataExample {}
#'
#' @templateVar metadataExample {}
#'
#' @template slot_summary
#'
#' @template slot_put
#'
#' @template slot_get
#'
#' @references
#' 1. RBR website (https://www.rbr-global.com/products)
#'
#' @author Dan Kelley and Clark Richards
#'
#' @family classes provided by oce
#' @family things related to rsk data
setClass("rsk", contains="oce")

#' Sample Rsk Dataset
#'
#' A sample `rsk` object derived from a Concerto CTD manufactured by RBR Ltd.
#'
#' The data were obtained September 2015, off the west coast
#' of Greenland, by Matt Rutherford and Nicole Trenholm of the
#' Ocean Research Project, in collaboration with RBR and with the
#' NASA Oceans Melting Greenland project. The `rsk` object was
#' created with [read.rsk()] with `allTables=FALSE`, after which
#' some metadata were added and the samples were trimmed to
#' just the downcast portion.
#'
#' @name rsk
#'
#' @docType data
#'
#' @references `https://rbr-global.com/`
#'
#' @examples
#' library(oce)
#' data(rsk)
#' # The object doesn't "know" it is CTD until told so
#' plot(rsk)
#' plot(as.ctd(rsk))
#'
#' @family datasets provided with oce
#' @family things related to rsk data
NULL

setMethod(f="initialize",
    signature="rsk",
    definition=function(.Object, time, pressure, temperature, filename="", ...) {
        .Object <- callNextMethod(.Object, ...)
        if (!missing(time))
            .Object@data$time <- time
        if (!missing(pressure))
            .Object@data$pressure <- pressure
        if (!missing(temperature))
            .Object@data$temperature <- temperature
        .Object@metadata$filename <- filename
        .Object@metadata$model <- NA
        .Object@metadata$pressureType <- "absolute"
        .Object@metadata$pressureAtmospheric <- 10.1325
        .Object@processingLog$time <- presentTime()
        .Object@processingLog$value <- "create 'rsk' object"
        return(.Object)
    })

#' Summarize a Rsk Object
#'
#' Summarizes some of the data in a [rsk-class] object, presenting such information
#' as the station name, sampling location, data ranges, etc.
#'
#' @param object An [rsk-class] object.
#'
#' @param ... Further arguments passed to or from other methods.
#'
#' @seealso The documentation for [rsk-class] explains the structure
#' of CTD objects, and also outlines the other functions dealing with them.
#'
#' @examples
#' library(oce)
#' data(rsk)
#' summary(rsk)
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
setMethod(f="summary",
    signature="rsk",
    definition=function(object, ...) {
        m <- object@metadata
        mnames <- names(m)
        cat("rsk summary\n-----------\n", ...)
        cat("* Instrument:    model ", m$model,
            " serial number ", m$serialNumber, "\n", sep="")
        if ("pressureAtmospheric" %in% mnames)
            cat(paste("* Atm. press.:   ", m$pressureAtmospheric, "\n", sep=""))
        if ("pressureType" %in% mnames)
            cat(paste("* Pressure type: ", m$pressureType, "\n", sep=""))
        cat(paste("* Source:        `", m$filename, "`\n", sep=""))
        invisible(callNextMethod()) # summary
    })

#' Extract Something From a Rsk Object
#'
#' @param x an [rsk-class] object.
#'
#' @section Details of the Specialized Method:
#'
#' * If `i` is `"?"`, then the return value is a list
#' containing four items, each of which is a character vector
#' holding the names of things that can be accessed with `[[`.
#' The `data` and `metadata` items hold the names of
#' entries in the object's data and metadata
#' slots, respectively. The `dataDerived`
#' and `metadataDerived` items are each NULL, because
#' no derived values are defined by [rsk-class] objects.
#'
#' @template sub_subTemplate
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
setMethod(f="[[",
    signature(x="rsk", i="ANY", j="ANY"),
    definition=function(x, i, j, ...) {
        if (i == "?") {
            return(list(metadata=sort(names(x@metadata)),
                metadataDerived=NULL,
                data=sort(names(x@data)),
                dataDerived=NULL))
        } else {
            callNextMethod()         # [[
        }
    })

#' Replace Parts of a Rsk Object
#'
#' @param x an [rsk-class] object.
#'
#' @template sub_subsetTemplate
#'
#' @family things related to rsk data
setMethod(f="[[<-",
    signature(x="rsk", i="ANY", j="ANY"),
    definition=function(x, i, j, ..., value) {
        callNextMethod(x=x, i=i, j=j, ...=..., value=value) # [[<-
    })



#' Subset a Rsk Object
#'
#' Subset a rsk object.  This function is somewhat analogous to
#' [subset.data.frame()], but subsetting is only permitted by time.
#'
#' @param x an [rsk-class] object.
#'
#' @param subset a condition to be applied to the `data` portion of `x`.
#' See \dQuote{Details}.
#'
#' @param \dots ignored.
#'
#' @return
#' An [rsk-class] object.
#'
#' @examples
#' library(oce)
#' data(rsk)
#' plot(rsk)
#' plot(subset(rsk, time < mean(range(rsk[['time']]))))
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
#' @family functions that subset oce objects
setMethod(f="subset",
    signature="rsk",
    definition=function(x, subset, ...) {
        res <- new("rsk") # start afresh in case x@data is a data.frame
        res@metadata <- x@metadata
        res@processingLog <- x@processingLog
        # message("NOTE: debugging output coming up!")
        for (i in seq_along(x@data)) {
            #  message("i: ", i)
            #  str(x@data)
            #  str(x@data$time[1])
            #  print(x@data$time[1])
            #  print(x@data$time[2])
            #  print(is.language(substitute(subset)))
            #  str(substitute(subset))
            #  Prior to 2015-01-15 the next line was
            #    r <- eval(substitute(subset), x@data)#, parent.frame())
            # But that failed when calling subset from within other functions; see
            # github (FIXME: fill in issue link, when issue is submitted).
            #     http://r.789695.n4.nabble.com/getting-environment-from-quot-top-quot-promise-td4685138.html
            # for a question regarding environments. I used to have parent.frame() here, and
            # in other "subset" definitions, but my tests are suggesting parent.frame(2)
            # will work more generally: (a) within flat code and (b) within a function
            # that is passed items to go in the subset.
            r <- eval(substitute(expr=subset, env=environment()), envir=x@data, enclos=parent.frame(2))
            #  str(r)
            r <- r & !is.na(r)
            res@data[[i]] <- x@data[[i]][r]
        }
        names(res@data) <- names(x@data)
        subsetString <- paste(deparse(substitute(expr=subset, env=environment())), collapse=" ")
        res@processingLog <- processingLogAppend(res@processingLog,
            paste("subset.rsk(x, subset=", subsetString, ")", sep=""))
        res
    })

#' Infer Rsk units from a vector of strings
#'
#' This is used by [read.rsk()] to infer the units of data, based
#' on strings stored in `.rsk` files. Lacking a definitive guide
#' to the format of these file, this function was based on visual inspection
#' of the data contained within a few sample files; unusual sensors may
#' not be handled properly.
#'
#' @param s Vector of character strings, holding the `units` entry in the
#' `channels` table of the `.rsk` database.
#'
#' @return List of unit lists.
#'
#' @family functions that interpret variable names and units from headers
unitFromStringRsk <- function(s)
{
    # Note: some of these use e.g. [lL] because the Ruskin GUI uses upper case,
    # whereas at least one file has used lower case. Another odd case is
    # "dbar" vs "dBar", both of which have been seen in files. Still, it
    # was decided not to use ignore.case=TRUE in the grep() commands,
    # because that seems to overly blunt the tool.
    #
    # Here's how to figure out special characters:
    # print(s) # nolint
    # [1] "µMol/m²/s"
    # Browse[1]> Encoding(s)<-"bytes" # nolint
    # Browse[2]> print(s) # nolint
    # [1] "\\xc2\\xb5Mol/m\\xc2\\xb2/s"
    #
    # 2022-06-28 upcoming version of R will require elimination of \x sequences; see
    # https://developer.r-project.org/Blog/public/2022/06/27/why-to-avoid-%5Cx-in-regular-expressions/index.html
    # Below is a table of code, unicode, unicode-in-R, and old-method-in-R.
    #    µ=U+03BC, new=\u03bc, old=\xc2\xb5
    #    °=U+00B0, new=\u00b0, old=\xB0
    #    ²=U+00B2, new=\u00b2, old=\xc2\xb2
    if (grepl("mg/[lL]", s)) {
        list(unit=expression(mg/l), scale="")
    } else if (grepl("m[lL]/[lL]", s)) {
        list(unit=expression(ml/l), scale="")
    # 2022-06-28 else if (grepl("((u)|(\xc2\xb5))[mM]ol/[lL]", s))
    } else if (grepl("((u)|(\u03bc))[mM]ol/[lL]", s)) {
        list(unit=expression(mu*mol/l), scale="")
    # 2022-06-28 else if (grepl("((u)|(\xc2\xb5))g/[lL]", s))
    } else if (grepl("((u)|(\u03bc))g/[lL]", s)) {
        list(unit=expression(mu*g/l), scale="")
    } else if (grepl("mS/cm", s)) {
        list(unit=expression(mS/cm), scale="")
    # 2022-06-28 else if (grepl("((u)|(\xc2\xb5))S/cm", s))
    } else if (grepl("((u)|(\u03bc))S/cm", s)) {
        list(unit=expression(mu*S/cm), scale="")
    } else if (grepl("d[bB]ar", s)) {
        list(unit=expression(dbar), scale="")
    } else if (grepl("%", s)) {
        list(unit=expression(percent), scale="")
    } else if (grepl("pH_units", s)) {
        list(unit=expression(), scale="")
    } else if (grepl("NTU", s)) {
        list(unit=expression(NTU), scale="")
    # 2022-06-28 else if (grepl("\xB0", s))
    } else if (grepl("\u00b0", s)) {
        list(unit=expression(degree*C), scale="ITS-90") # guessing on scale
    # 2022-06-28 else if (grepl("\\xc2\\xb5Mol/m\\xc2\\xb2/s", s)) # µMol/m²/s
    } else if (grepl("\u03bcMol/m\u00b2/s", s)) {
        list(unit=expression(mu*mol/m^2/s), scale="")
    } else if (is.na(s)) {
        list(unit=expression(), scale="?")
    } else {
        warning("\"", s, "\" is not in the list of known .rsk units", sep="")
        list(unit=as.expression(s), scale="")
    }
}

#' Coerce Data Into a Rsk Object
#'
#' Create a rsk object.
#'
#' The contents of `columns` are be copied into the `data` slot
#' of the returned object directly, so it is critical that the names and units
#' correspond to those expected by other code dealing with
#' [rsk-class] objects. If there is a conductivity, it must be called
#' `conductivity`, and it must be in units of mS/cm. If there is a
#' temperature, it must be called `temperature`, and it must be an in-situ
#' value recorded in ITS-90 units.  And if there is a pressure, it must be
#' *absolute* pressure (sea pressure plus atmospheric pressure) and it must
#' be named `pressure`. No checks are made within `as.rsk` on any of
#' these rules, but if they are broken, you may expect problems with any further
#' processing.
#'
#' @param time a vector of times for the data.
#'
#' @param columns a list or data frame containing the measurements at the indicated
#' times; see \dQuote{Details}.
#'
#' @param filename optional name of file containing the data.
#'
#' @param instrumentType type of instrument.
#'
#' @param serialNumber serial number for instrument.
#'
#' @param model instrument model type, e.g. `"RBRduo"`.
#'
#' @param sampleInterval sampling interval. If given as `NA`, then this is
#' estimated as the median difference in times.
#'
#' @param debug a flag that can be set to `TRUE` to turn on debugging.
#'
#' @return An [rsk-class] object.
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
as.rsk <- function(time, columns,
    filename="", instrumentType="rbr", serialNumber="", model="",
    sampleInterval=NA,
    debug=getOption("oceDebug"))
{
    debug <- min(debug, 1)
    oceDebug(debug, "as.rsk(..., filename=\"", filename, "\",
        serialNumber=\"", serialNumber, "\")\n", sep="", unindent=1)
    if (inherits(time, "oce"))
        stop("cannot coerce from general oce object to rsk; submit an issue if you need this")
    if (missing(time))
        stop("must give time")
    if (!inherits(time, "POSIXt"))
        stop("'time' must be POSIXt")
    time <- as.POSIXct(time)
    res <- new("rsk")
    res@metadata$instrumentType <- instrumentType
    if (nchar(model))
        res@metadata$model <-model
    res@metadata$serialNumber <- serialNumber
    res@metadata$filename <- filename
    res@metadata$sampleInterval <- if (is.na(sampleInterval)) {
        median(diff(as.numeric(time)), na.rm=TRUE)
    } else {
        sampleInterval
    }
    processingLog <- paste(deparse(match.call()), sep="", collapse="")
    res@processingLog <- processingLogAppend(res@processingLog, processingLog)
    res@data <- list(time=time)
    if (!missing(columns)) {
        for (item in names(columns)) {
            res@data[[item]] <- columns[[item]]
        }
    }
    oceDebug(debug, "} # as.rsk()\n", sep="", unindent=1)
    res
}


#' Plot a rsk Object
#'
#' Rsk data may be in many forms, and it is not easy to devise a general plotting
#' strategy for all of them. The present function is quite crude, on the
#' assumption that users will understand their own datasets, and that they can
#' devise plots that are best-suited to their applications.  Sometimes, the
#' sensible scheme is to coerce the object into another form, e.g. using
#' `plot(as.ctd(rsk))` if the object contains CTD-like data.  Other times,
#' users should extract data from the `rsk` object and construct plots
#' themselves. The idea is to use the present function mainly to get an overview,
#' and for that reason, the default plot type (set by `which`) is a set of
#' time-series plots, because the one thing that is definitely known about
#' `rsk` objects is that they contain a `time` vector in their
#' `data` slot.
#'
#' @details Plots produced are time series plots of the data in the
#'     object. The default, `which="timeseries"` plots all data
#'     fields, and over-rides any other specification. Specific fields
#'     can be plotted by naming the field,
#'     e.g. `which="temperature"` to plot a time series of just
#'     the temperature field.
#'
#' @param x an [rsk-class] object.
#'
#' @param which character indicating desired plot types.  These are
#'     graphed in panels running down from the top of the page.  See
#'     \dQuote{Details} for the meanings of various values of
#'     `which`.
#'
#' @param tlim optional limits for time axis.  If not provided, the value will be
#' inferred from the data.
#'
#' @param ylim optional limits for the y axis.  If not provided, the
#'     value will be inferred from the data.  (It is helpful to
#'     specify this, if the auto-scaled value will be inappropriate,
#'     e.g. if more lines are to be added later). Note that this is
#'     ignored, unless `length(which) == 1` and `which`
#'     corresponds to one of the data fields. If a multipanel plot of
#'     a specific subset of the data fields is desired with
#'     `ylim` control, it should be done panel by panel (see
#'     Examples).
#'
#' @param xlab optional label for x axis.
#'
#' @param ylab optional label for y axis.
#'
#' @param tformat optional argument passed to [oce.plot.ts()], for plot
#' types that call that function.  (See [strptime()] for the format
#' used.)
#'
#' @param drawTimeRange boolean that applies to panels with time as the horizontal
#' axis, indicating whether to draw the time range in the top-left margin of the
#' plot.
#'
#' @param abbreviateTimeRange boolean that applies to panels with time as the
#' horizontal axis, indicating whether to abbreviate the second time in the time
#' range (e.g. skipping the year, month, day, etc. if it's the same as the start
#' time).
#'
#' @param useSmoothScatter a boolean to cause [smoothScatter()] to be
#' used for profile plots, instead of [plot()].
#'
#' @param mgp 3-element numerical vector to use for [`par`]`("mgp")`, and
#' also for `par(mar)`, computed from this.  The default is tighter than the
#' R default, in order to use more space for the data and less for the axes.
#'
#' @param mar value to be used with [`par`]`("mar")`.
#'
#' @param main main title for plot, used just on the top panel, if there are several panels.
#'
#' @param debug a flag that turns on debugging, if it exceeds 0.
#'
#' @param ... optional arguments passed to plotting functions.
#'
#' @examples
#' library(oce)
#' data(rsk)
#' # 1. default timeseries plot of all data fields
#' plot(rsk)
#' # 2. plot in ctd format
#' plot(as.ctd(rsk))
#'
#' @seealso
#' The documentation for [rsk-class] explains the structure of
#' `rsk` objects, and also outlines the other functions dealing with them.
#'
#' @author Dan Kelley and Clark Richards
#'
#' @family functions that plot oce data
#' @family things related to rsk data
#'
#' @aliases plot.rsk
setMethod(f="plot",
    signature=signature("rsk"),
    definition=function(x, which="timeseries",
        tlim, ylim,
        xlab, ylab,
        tformat,
        drawTimeRange=getOption("oceDrawTimeRange"),
        abbreviateTimeRange=getOption("oceAbbreviateTimeRange"),
        useSmoothScatter=FALSE,
        mgp=getOption("oceMgp"),
        mar=c(mgp[1]+1.5, mgp[1]+1.5, 1.5, 1.5),
        main="",
        debug=getOption("oceDebug"),
        ...)
    {
        oceDebug(debug, "plot.rsk(..., which=", which, ", ...) {\n", unindent=1)
        dotsNames <- names(list(...))
        # FIXME: In the below, we could be more clever for single-panel plots
        # but it may be better to get users out of the habit of supplying xlim
        # etc (which will yield errors in plot.lm(), for example).
        if ("xlim" %in% dotsNames)
            stop("in plot.rsk() : 'xlim' not allowed; use tlim", call.=FALSE)
        if (any(which=="timeseries"))
            which <- "timeseries" # "timeseries" overrides any others
        lw <- length(which)
        if (lw == 1 && which=="timeseries") {
            opar <- par(no.readonly = TRUE)
            on.exit(par(opar))
            names <- names(x@data)
            if (!"time" %in% names)
                stop("plot.rsk() cannot plot timeseries, since no \"time\" data", call.=FALSE)
            names <- names[names != "time" & names != "tstamp"]
            par(mfrow=c(length(names), 1))
            for (name in names) {
                if (!is.null(x[["units"]])) {
                    unit <- x[["units"]][[name]]$unit
                    tmp <- c(name, '~"["*', as.character(unit), '*"]"')
                    label <- bquote(.(parse(text=paste0(tmp, collapse=""))))
                } else {
                    label <- name
                }
                oce.plot.ts(x[["time"]], x[[name]], ylab=label, ...)
            }
        } else {
            # individual panels
            # Trim out plots that we cannot do.
            names <- names(x@data)
            names <- names[- (names=="time")]
            nw <- length(which)
            if (missing(main)) {
                main <- rep("", length.out=nw)
            }
            main <- rep(main, length.out=nw)
            oceDebug(debug, "after nickname-substitution, which=c(", paste(which, collapse=","), ")\n")
            for (w in 1:nw) {
                oceDebug(debug, "which[", w, "]=", which[w], "\n")
                haveField <- (which[w] %in% names) && any(is.finite(x[[which[w]]]))
                if (haveField) {
                    if (!is.null(x[["units"]])) {
                        unit <- x[["units"]][[which[w]]]$unit
                        tmp <- c(which[w], '~"["*', as.character(unit), '*"]"')
                        label <- bquote(.(parse(text=paste0(tmp, collapse=""))))
                    } else {
                        label <- which[w]
                    }
                    oce.plot.ts(x@data$time, x[[which[w]]],
                        xlab=if (!missing(xlab)) xlab else "",
                        ylab=if (missing(ylab)) label else  ylab,
                        type="l",
                        xlim=if (missing(tlim)) range(x@data$time, na.rm=TRUE) else tlim,
                        ylim=if (missing(ylim)) range(x[[which[w]]], na.rm=TRUE) else ylim,
                        tformat=tformat,
                        drawTimeRange=drawTimeRange,
                        mgp=mgp, mar=mar, main=main[w], ...)
                    drawTimeRange <- FALSE    # only first time panel gets time indication
                    axis(2)
                } else {
                    stop("Unrecognized 'which'; not \"timeseries\" or a data name")
                }
            }
        }
        oceDebug(debug, "} # plot.rsk()\n", unindent=1)
        invisible(NULL)
    })



#' Read a Rsk file
#'
#' Read an RBR rsk or txt file, e.g. as produced by an RBR logger, producing an
#' object of class `rsk`.
#'
#' This can read files produced by several RBR instruments.  At the moment, five
#' styles are understood: (1) text file produced as an export of an RBR `hex`
#' or `rsk` file; (2) text file with columns for temperature and pressure
#' (with sampling times indicated in the header); (3) text file with four columns,
#' in which the date the time of day are given in the first two columns, followed
#' by the temperature, and pressure; (4) text file with five columns, in which
#' depth in the water column is given after the pressure; (5) an SQLite-based
#' database format. The first four options are provided mainly for historical
#' reasons, since RBR instruments at the date of writing commonly use the SQLite
#' format, though the first option is common for all instruments that produce a
#' `hex` file that can be read using Ruskin.
#' Options 2-4 are mostly obsolete, and will be removed from future versions.
#'
#' **A note on location information.**
#' It is possible to couple RBR CTD devices with smart phones or tablets
#' (see RBR-global, 2020), with the location data from the latter being stored
#' in the output `.rsk` file. The format does not seem to be documented by RBR,
#' but `read.rsk()` attempts to infer location nevertheless.
#' The procedure involves comparing the `tstamp` (time-stamp)
#' field from the hydrographic part of the dataset (which is in a database
#' table named `data`) with the `tstamp` field in the geographical part of the
#' dataset (in a table named `geodata`). (The `geodata` entries are filtered to
#' those for which the `origin` field equals `"auto"`. This seems to align
#' with times shown for "LOCATION" data in RBR-provided viewing software.)
#' The connection between the two fields is done with [approx()], after
#' adding `tzOffsetLocation` (with units converted appropriately) to the
#' time inferred from `geodata$tstamp` field, to account for the fact that phones and tablets
#' may be set to local time. If the procedure succeeds, then `longitude` and
#' `latitude` are inserted into the `metadata` slot of the return value, in the
#' form of vectors with the same length as `pressure` in the `data` slot.
#'
#' **A note on conductivity.**
#' RBR devices record conductivity in mS/cm, and it
#' is this value that is stored in the object returned by `read.rsk`. This can
#' be converted to conductivity ratio (which is what many other instruments report)
#' by dividing by 42.914 (see Culkin and Smith, 1980) which will be necessary in
#' any seawater-related function that takes conductivity ratio as an argument (see
#' \dQuote{Examples}).
#'
#' **A note on pressure.**
#' RBR devices tend to record absolute pressure (i.e.
#' sea pressure plus atmospheric pressure), unlike most oceanographic instruments
#' that record sea pressure (or an estimate thereof).  The handling of pressure
#' is controlled with the `patm` argument, for which there are three
#' possibilities.  (1) If `patm` is `FALSE` (the default), then
#' pressure read from the data file is stored in the `data` slot of return
#' value, and the `metadata` item `pressureType` is set to the string
#' `"absolute"`.  (2) If `patm` is `TRUE`, then an estimate of
#' atmospheric pressure is subtracted from the raw data. For data files in the
#' SQLite format (i.e.  `*.rsk` files), this estimate will be the value read
#' from the file, or the ``standard atmosphere'' value 10.1325 dbar, if the file
#' lacks this information.  (3) If `patm` is a numerical value (or list of
#' values, one for each sampling time), then `patm` is subtracted from the
#' raw data.  In cases 2 and 3, an additional column named
#' `pressureOriginal` is added to the `data` slot to store the value
#' contained in the data file, and `pressureType` is set to a string
#' starting with `"sea"`.  See [as.ctd()] for details of how this
#' setup facilitates the conversion of [rsk-class] objects to
#' [ctd-class] objects.
#'
#' @param file a connection or a character string giving the name of the RSK file to
#' load. Note that `file` must be a character string, because connections are
#' not used in that case, which is instead handled with database calls.
#'
#' @param from indication of the first datum to read.  This can a positive integer
#' to indicate sequence number, the POSIX time of the first datum, or a character
#' string that can be converted to a POSIX time.  (For POSIX times, be careful
#' about the `tz` argument.)
#'
#' @param to an indication of the last datum to be read, in the same format as
#' `from`.  If `to` is missing, data will be read to the end of the file.
#'
#' @param by an indication of the stride length to use while walking through the
#' file.  If this is an integer, then `by-1` samples are skipped between each
#' pair of samples that is read.  If this is a string representing a time interval,
#' in colon-separated format (HH:MM:SS or MM:SS), then this interval is divided by
#' the sampling interval, to get the stride length.
#'
#' @param type optional file type, presently can be `rsk` or `txt` (for a
#' text export of an RBR rsk or hex file). If this argument is not provided, an
#' attempt will be made to infer the type from the file name and contents.
#'
#' @template encodingIgnoredTemplate
#'
#' @param tz the timezone assumed for the time values stored in the
#' data file.  Unless the user has set an alternative value in the
#' `~/.Rprofile` file, the default will be `"UTC"`; see the
#' \dQuote{Altering oce Defaults} vignette for more on the use
#' of the `~/.Rprofile` file.
#'
#' @param tzOffsetLocation offset, in hours, between the CTD clock and
#' the clock in the controlling computer/tablet/phone (if one was used during
#' the sampling). This offset is required to relate location information from the
#' controller to hydrographic information from the CTD, using timestamps as an
#' index (see "A note on location information" in \dQuote{Details}).
#' If the user supplies a value for `tzOffsetLocation`, then that is used.
#' If not, an attempt is made to infer it from an item named `UTCdelta`
#' that might be present within a table named `epochs` in the file. If no
#' value can be inferred from either of these two methods, then
#' `tzOffsetLocation` is set to zero.
#'
#' @param patm controls the handling of atmospheric pressure, an important issue
#' for RBR instruments that record absolute pressure; see \dQuote{Details}.
#'
#' @param allTables logical value, TRUE by default, indicating whether to save
#' all the non-empty tables in the database (pruned of `blob` columns) in the
#' `metadata` slot of the returned object. This may be useful for detailed
#' analysis.
#'
#' @param processingLog if provided, the action item to be stored in the log.
#' This is typically only provided for internal calls; the default that it provides
#' is better for normal calls by a user.
#'
#' @param debug a flag that can be set to `TRUE` to turn on debugging.
#'
#' @return An [rsk-class] object.
#'
#' @seealso
#' The documentation for [rsk-class] explains the structure of
#' `rsk` objects, and also outlines other functions dealing with them.  Since
#' RBR has a wide variety of instruments, `rsk` datasets can be quite general,
#' and it is common to coerce `rsk` objects to other forms for specialized
#' work, e.g. [as.ctd()] can be used to create CTD object, so that the
#' generic plot obeys the CTD format.
#'
#' @references
#' Culkin, F., and Norman D. Smith, 1980. Determination of the concentration of
#' potassium chloride solution having the same electrical conductivity, at 15 C and
#' infinite frequency, as standard seawater of salinity 35.0000 ppt (Chlorinity
#' 19.37394 ppt). *IEEE Journal of Oceanic Engineering*, **5**, pp 22-23.
#'
#' RBR-global.com, 2020. “Ruskin User Guide.” RBR, August 18, 2020.
#' Revision RBR#0006105revH.
#'
#' @author Dan Kelley and Clark Richards
#'
#' @family things related to rsk data
read.rsk <- function(file, from=1, to, by=1, type, encoding=NA,
    tz=getOption("oceTz", default="UTC"), tzOffsetLocation,
    patm=FALSE, allTables=TRUE, processingLog, debug=getOption("oceDebug"))
{
    if (missing(file))
        stop("must supply 'file'")
    if (is.character(file)) {
        if (!file.exists(file))
            stop("cannot find file '", file, "'")
        if (0L == file.info(file)$size)
            stop("empty file '", file, "'")
    }
    from <- from[1]
    debug <- max(0, min(debug, 2))
    oceDebug(debug, "read.rsk(file=\"", file, "\", from=", format(from),
        ", to=", if (missing(to)) "(not given)" else format(to),
        ", by=", by,
        ", type=", if (missing(type)) "(missing)" else type,
        ", tz=\"", tz, "\", ...) {\n", sep="", unindent=1)
    filename <- file
    if (is.character(file)) {
        if (grepl(".rsk$", file, ignore.case=TRUE)) {
            type <- "rsk"
            file <- file(file, "r")
        } else if (grepl(".txt$", file, ignore.case=TRUE)) {
            type <- "txt"
            file <- file(file, "r", encoding=encoding)
        } else {
            file <- file(file, "r") # FIXME: can this happen?
        }
        on.exit(close(file))
    }
    if (!inherits(file, "connection"))
        stop("'file' must be a character string or connection")
    if (!isOpen(file)) {
        open(file, "r", encoding=encoding) # ignored if rsk
        on.exit(close(file))
    }
    from.keep <- from
    # measurement.deltat <- 0
    if (is.numeric(from) && from < 1)
        stop("from cannot be an integer less than 1")
    if (!missing(to) && is.numeric(to) && to < 1)
        stop("to cannot be an integer less than 1")
    if (!missing(to))
        to.keep <- to
    header <- c()
    measurementStart <-measurementEnd <- measurementDeltat <- NULL
    pressureAtmospheric <- NA
    res <- new("rsk", filename=filename)
    if (!missing(type) && type == "rsk") {
        if (!requireNamespace("RSQLite", quietly=TRUE))
            stop('must install.packages("RSQLite") to read rsk data')
        if (!requireNamespace("DBI", quietly=TRUE))
            stop('must install.packages("DBI") to read rsk data')
        con <- DBI::dbConnect(RSQLite::SQLite(), dbname=filename)
        # Advanced users might want to see what tables are in this file.
        tableNames <- RSQLite::dbListTables(con)
        oceDebug(debug, "This file had the following tables: ",
            paste(sort(tableNames), collapse=", "), "\n")
        if (allTables) {
            oceDebug(debug, "Reading tables and storing in metadata@ slot\n")
            # We skip a possible table named 'blob' (although I am not sure I
            # understood CR correctly on this; he might have been referring to
            # data of the 'blob' sqlite3 type).
            for (name in tableNames) {
                if (!identical(name, "blob")) { # FIXME: did CR mean table name or item name?
                    tmp <- RSQLite::dbReadTable(con, name)
                    #if ("blob" %in% names(tmp)) {
                    #    message("Skipping ", name, "$blob")
                    #    tmp["blob"] <- NULL
                    #}
                    if (nrow(tmp) > 0L) {
                        for (tmpname in names(tmp)) {
                            if (inherits(tmp[[tmpname]], "blob")) {
                                oceDebug(debug, "    Not storing @metadata$", name, "$", tmpname, " because it is a blob\n", sep="")
                                tmp$tmpname <- NULL
                            }
                        }
                        res@metadata[[name]] <- tmp
                        oceDebug(debug, "    Storing @metadata$", name, "\n", sep="")
                    } else {
                        oceDebug(debug, "    Not storing @metadata$", name, " because it has no data\n", sep="")
                    }
                }
            }
        }
        # rsk database-schema version number
        dbInfo <- RSQLite::dbReadTable(con, "dbInfo")
        # Ruskin software version number
        rskv <- dbInfo[1, 1]
        rskVersion <- as.numeric(strsplit(gsub(".[a-z].*$",
            "",
            gsub("^.*- *", "", rskv)), "\\.")[[1]])
        oceDebug(debug, "RSK software version ", paste(rskVersion, collapse="."), "\n")
        if (RSQLite::dbExistsTable(con, "appSettings")) {
            appSettings <- RSQLite::dbReadTable(con, "appSettings")
            rv <- appSettings[1, 2]
            ruskinVersion <- as.numeric(strsplit(gsub(".[a-z].*$", "", gsub("^.*- *", "", rv)), "\\.")[[1]])
        } else {
            ruskinVersion <- "mobile"
        }
        # Get geographic data to enable location matching of the data.
        geodata <- NULL
        if ("geodata" %in% tableNames) {
            geodata <- RSQLite::dbReadTable(con, "geodata")
            oceDebug(debug, "this file contains ", length(geodata$longitude), " location data\n")
            # We need to know the offset between the CTD clock and the
            # controller clock, so that we can later match up lon-lat from the
            # controller with hydrography from the CTD.  If the user provided
            # the tzOffsetLocation argument, use it.  Otherwise, try to infer
            # the value from the 'UTCdelta' field in the 'epochs' table.
            if (missing(tzOffsetLocation)) {
                tzOffsetLocation <- 0.0
                if ("epochs" %in% tableNames) {
                    epochs <- RSQLite::dbReadTable(con, "epochs")
                    if ("UTCdelta" %in% names(epochs)) {
                        tzOffsetLocation <- epochs$UTCdelta / 3600.0 / 1000.0
                        oceDebug(debug, "inferred tzOffsetLocation=", tzOffsetLocation, "\n")
                    } else {
                        oceDebug(debug, "tzOffsetLocation can't computed because 'epochs' table lacks 'UTCdelta'\n")
                    }
                } else {
                    oceDebug(debug, "tzOffsetLocation can't computed because there is no 'epochs' table\n")
                }
            }
            oceDebug(debug, "using specified or computed tzOffsetLocation=", tzOffsetLocation, "\n")
        } else {
            oceDebug(debug, "this file does not contain location data\n")
        }

        # Get atmospheric pressure
        pressureAtmospheric <- 10.1325 # FIXME: what is best default?
        oceDebug(debug, "first, guess pressureAtmospheric=", pressureAtmospheric, "\n")
        warn <- FALSE
        try({
            # need to wrap in try() because this can fail
            deriveDepth <- RSQLite::dbReadTable(con, "deriveDepth")
            pressureAtmospheric <- deriveDepth$atmosphericPressure
            warn <- TRUE
        }, silent=TRUE)
        if (warn) {
            warning("non-standard pressureAtmospheric value: ", pressureAtmospheric)
        }
        # some cases can have an empty pressureAtmospheric
        if (length(pressureAtmospheric) == 0) {
            warning("empty pressureAtmospheric value in rsk file. Setting to default value of 10.1325")
            pressureAtmospheric <- 10.1325
        }
        # message("NEW: pressureAtmospheric:", pressureAtmospheric)
        oceDebug(debug, "after studying the RSK file, now have pressureAtmospheric=", pressureAtmospheric, "\n")
        # From notes in comments above, it seems necessary to order by
        # timestamp (tstamp). Ordering does not seem to be an option for
        # dbReadTable(), so we use dbFetch().
        #
        # Get time stamp. Note the trick of making it floating-point
        # to avoid the problem that R lacks 64 bit integers.
        fields <- DBI::dbListFields(con, "data")
        fields <- fields[!grepl("tstamp", fields)]
        sql_fields <- if (packageVersion("RSQLite") < "2.0") "1.0*tstamp AS tstamp" else "tstamp"
        sql_fields <- paste(c(sql_fields, fields), collapse=",")
        sql_fields <- paste("SELECT", sql_fields, "FROM data")
        # When to and from are numeric and not equal to 1 we have to query the table
        # and then sort the times so that the limits are meaningful.  This code
        # does that only when required and will be slower than when from and to
        # are dates or character.
        time <- NA
        if (!missing(to)) {
            to <- to[1]
            if (inherits(to, "POSIXt")) {
                to <- as.character(as.numeric(to)*1000)
            } else if (inherits(to, "character")) {
                to <- as.character(as.numeric(as.POSIXct(to, tz=tz))*1000)
            } else if (is.numeric(to)) {
                qres <- DBI::dbSendQuery(con,
                    if (packageVersion("RSQLite") < "2.0") {
                        "select 1.0*tstamp from data order by tstamp;"
                    } else {
                        "select tstamp from data order by tstamp;"
                    })
                t1000 <- DBI::dbFetch(qres, n=-1)[[1]]
                RSQLite::dbClearResult(qres)
                time <- numberAsPOSIXct(as.numeric(t1000) / 1000, type="unix", tz=tz)
                oceDebug(debug, "at rsk.R bookmark A 1 of 3: time[1] is ", format(time[1], "%Y-%m-%d %H:%M:%S (UTC%z, i.e. %Z)"), "\n")
            }
        }
        if (is.numeric(from) && from != 1 && all(is.na(time))) {
            qres <- DBI::dbSendQuery(con,
                if (packageVersion("RSQLite") < "2.0") {
                    "select 1.0*tstamp from data order by tstamp;"
                } else {
                    "select tstamp from data order by tstamp;"
                })
            t1000 <- DBI::dbFetch(qres, n=-1)[[1]]
            RSQLite::dbClearResult(qres)
            time <- numberAsPOSIXct(as.numeric(t1000) / 1000, type="unix", tz=tz)
            oceDebug(debug, "at rsk.R bookmark A 2 of 3: time[1] is ", format(time[1], "%Y-%m-%d %H:%M:%S (UTC%z, i.e. %Z)"), "\n")
        }
        # format to and from that match tstamp from the rsk file
        if (inherits(from, "POSIXt")) {
            from <- as.character(as.numeric(from)*1000)
        } else if (inherits(from, "character")) {
            from <- as.character(as.numeric(as.POSIXct(from, tz=tz))*1000)
        }
        if (!all(is.na(time))) {
            if (is.numeric(from))
                from <- t1000[from]
            if (missing(to)) { # FIXME: can this happen, given earlier code?
                to <- tail(t1000, 1)
            } else if (is.numeric(to)) {
                to <- t1000[to]
            }
        }
        # Generate the sql that contains the time filters
        qresSQL <- sql_fields
        if (missing(to)) {
            if (is.numeric(from)) {
                qresSQL <- paste(qresSQL, ";")
            } else {
                qresSQL <- paste(qresSQL, "where tstamp >=",  from, ";")
            }
        } else {
            if (missing(to)) {
                qresSQL <- paste(qresSQL, "where tstamp >=",  from, ";")
            } else if (from==1) {
                qresSQL <- paste(qresSQL, "where tstamp <=",  to, ";")
            } else {
                qresSQL <- paste(qresSQL, "where tstamp between",  from, "and", to, ";")
            }
        }
        oceDebug(debug, "SQL query to fetch data: \"", qresSQL, "\"\n", sep="")
        qres <- DBI::dbSendQuery(con, qresSQL)
        # Now, get only the specified time range
        data <- DBI::dbFetch(qres, n=-1L)
        RSQLite::dbClearResult(qres)
        timeOrder <- order(data$tstamp)
        oceDebug(debug, "before trimming data, it has dimension ", paste(dim(data), collapse="x"), "\n")
        data <- data[timeOrder, ]
        oceDebug(debug, "after trimming data, it has dimension ", paste(dim(data), collapse="x"), "\n")
        tstamp <- data[, 1]
        time <- numberAsPOSIXct(as.numeric(tstamp)/1000, type="unix", tz=tz)
        oceDebug(debug, "at rsk.R bookmark A 3 of 3: time[1] is ", format(time[1], "%Y-%m-%d %H:%M:%S (UTC%z, i.e. %Z)"), "\n")
        # Need to check if there is a datasetID column (for rskVersion >= 1.12.2)
        # If so, for now just extract it from the data matrix
        hasDatasetID <- sum(grep("datasetID", names(data))) > 0
        if (hasDatasetID) {
            datasetID <- data[, grep("datasetID", names(data))]
            data <- data[, -grep("datasetID", names(data)), drop=FALSE]
        }
        data <- data[, -1L, drop=FALSE] # drop the corrupted time column
        # Get column names from the 'channels' table.
        names <- tolower(RSQLite::dbReadTable(con, "channels")$longName)
        # FIXME: some longnames have UTF-8 characters, and/or
        # spaces. Should coerce to ascii with no spaces, or at least
        # recognize fields and rename (e.g. `dissolved O2` should
        # just be `oxygen`)
        names <- gsub(" ", "", names, fixed = TRUE) # remove spaces
        Encoding(names) <- "latin1"
        names <- iconv(names, "latin1", "ASCII", sub="")
        # if dissolvedo is a name call it oxygen
        names[which(match(names, "dissolvedo") == 1)] <- "oxygen"
        channelsTable <- RSQLite::dbReadTable(con, "channels")
        if ("isMeasured" %in% names(channelsTable)) {
            isMeasured <- channelsTable$isMeasured == 1
        } else {
            isMeasured <- channelsTable$isDerived == 0
            #warning("old Ruskin file detected; if problems arise, update file with Ruskin software")
        }
        dataNamesOriginal <- c("-", channelsTable$shortName[isMeasured])
        #1491> message("below is dataNamesOriginal: (at start)");print(dataNamesOriginal)
        #[issue 1483] print(cbind(channelsTable,isMeasured))
        names <- names[isMeasured] # only take names of things that are in the data table
        unitsRsk <- channelsTable$units[isMeasured]
        # Check for duplicated names, and append digits to make unique
        if (sum(duplicated(names)) > 0) {
            for (n in names) {
                dup <- match(names, n, nomatch=0)
                if (sum(dup) > 1) {
                    names[which(dup==1)] <- paste0(n, c("", seq(2, sum(dup))))
                }
            }
        }
        names(data) <- names
        data <- as.list(data)
        instruments <- RSQLite::dbReadTable(con, "instruments")
        serialNumber <- instruments$serialID
        model <- instruments$model
        schedules <- RSQLite::dbReadTable(con, "schedules")
        sampleInterval <- schedules$samplingPeriod/1000 # stored as milliseconds in rsk
        RSQLite::dbDisconnect(con)
        res@data$time <- time
        res@metadata$dataNamesOriginal <- dataNamesOriginal
        for (iname in seq_along(names)) {
            res@data[[names[iname]]] <- data[[names[iname]]]
            res@metadata$units[[names[iname]]] <- unitFromStringRsk(unitsRsk[iname])
            if (debug > 1) {
                # FIXME: developer sets this for temporary (and undocumented) debugging
                cat("\n***\nUNIT CHECK. The rsk string", unitsRsk[iname], "yielded as follows:\n")
                print(res@metadata$units[[names[iname]]])
                cat("***\n")
            }
        }
        res@data$tstamp <- tstamp
        res@metadata$dataNamesOriginal <- c(res@metadata$dataNamesOriginal, "tstamp")
        # Possibly add longitude and latitude to data slot. For ideas on how to
        # do that, see https://github.com/dankelley/oce/issues/2024
        if (!is.null(geodata)) {
            geodata$time <- numberAsPOSIXct(geodata$tstamp/1e3, type="unix")+tzOffsetLocation*3600
            look <- if ("origin" %in% names(geodata)) {
                geodata$origin == "auto"
            } else {
                rep(TRUE, length(geodata$tstamp))
            }
            #res@metadata$latitudeOld <- approx(geodata$time, geodata$latitude, res@data$time)$y
            res@metadata$latitude <- approx(geodata$time[look], geodata$latitude[look], res@data$time, rule=2)$y
            #res@metadata$longitudeOld <- approx(geodata$time, geodata$longitude,
            #res@data$time)$y
            res@metadata$longitude <- approx(geodata$time[look], geodata$longitude[look], res@data$time, rule=2)$y
            #message("lon-lat may be wrong; see https://github.com/dankelley/oce/issues/2024#issuecomment-1345373099")
            #message("TEST: examine both longitude and longitudeNew etc")
        }
        res@metadata$units$pressure$scale <- "absolute"
        #1491> message("res@metadata$dataNamesOriginal L909:");print(res@metadata$dataNamesOriginal)
        #?browser()
        if ("pressure" %in% names) {
            # possibly compute sea pressure
            if (is.logical(patm)) {
                if (patm) {
                    # This code is a bit tricky because we modify existing pressure in-place
                    dataNames <- names(res@data)
                    dataNames[dataNames=="pressure"] <- "pressureOriginal"
                    names(res@data) <- dataNames
                    res@metadata$units$pressureOriginal <- list(unit=expression(dbar), scale="absolute")
                    res@data$pressure <- res@data$pressureOriginal - 10.1325
                    res@metadata$units$pressure <- list(unit=expression(dbar), scale="sea")
                    res@metadata$dataNamesOriginal <- c(res@metadata$dataNamesOriginal, "-")
                    res@metadata$pressureType <- "sea"
                    oceDebug(debug, "patm=TRUE, so removing std atmospheric pressure, 10.1325 dbar\n")
                }
            } else if (is.numeric(patm)) {
                npatm <- length(patm)
                if (1L < npatm && npatm != length(pressure))
                    stop("if patm is numeric, its length must equal 1, or the length(pressure).")
                oceDebug(debug, "patm is numeric, so subtracting it from pressure\n")
                # This code is a bit tricky because we modify existing pressure in-place
                dataNames <- names(res@data)
                dataNames[dataNames=="pressure"] <- "pressureOriginal"
                names(res@data) <- dataNames
                res@metadata$units$pressureOriginal <- list(unit=expression(dbar),
                    scale="absolute")
                res@data$pressure <- res@data$pressureOriginal - patm
                res@metadata$units$pressure <- list(unit=expression(dbar), scale="sea")
                res@metadata$dataNamesOriginal <- c(res@metadata$dataNamesOriginal, "-")
                res@metadata$pressureType <- "sea"
            } else {
                stop("patm must be logical or numeric")
            }
        }
        res@metadata$model <- model
        res@metadata$serialNumber <- serialNumber
        res@metadata$sampleInterval <- sampleInterval
        res@metadata$rskVersion <- rskVersion
        res@metadata$ruskinVersion <- ruskinVersion
        names(res@metadata$dataNamesOriginal) <- names(res@data)
        if (hasDatasetID) {
            res@metadata$datasetID <- datasetID
        }
        # There is actually no need to set the conductivity unit since new()
        # sets it, but do it anyway, as a placeholder to show where to do
        # this, in case some RBR devices use different units.
        if ("cond12" %in% names(res@data)) {
            # [issue 1483] Change the name, and possibly unit, of 'cond12'
            #
            # For a sample file, channelsTable gives the unit for cond12 as
            # NA.  Rather than make unitFromStringRsk() give a conductivity
            # unit whenever it gets an NA value (which might occur for other
            # fields -- who knows?), we will switch NA to uS/cm because
            # that seems to be the usual unit for RBR instruments. However,
            # if the cond12 unit is not NA, we will leave it as it is, on the
            # assumption that unitFromStringRsk() has already interpreted
            # it correctly
            w <- which("cond12" == names)[1]
            if (is.na(unitsRsk[w])) {
                res@metadata$units$cond12 <- NULL # remove existing
                res@metadata$units$conductivity <- list(unit=expression(mS/cm), scale="")
            }
            newnames <- gsub("cond12", "conductivity", names(res@data))
            names(res@data) <- newnames
            names(res@metadata$dataNamesOriginal) <- newnames
        } else {
            # FIXME: will this work for all RBR rsks that don't contain cond12?
            res@metadata$units$conductivity <- list(unit=expression(mS/cm), scale="")
        }
        res@metadata$pressureAtmospheric <- pressureAtmospheric
        res@processingLog <- processingLogAppend(res@processingLog, paste(deparse(match.call()), sep="", collapse=""))
        oceDebug(debug, "} # read.rsk()\n", sep="", unindent=1)
        return(res)
    } else if (!(missing(type)) && type=="txt") {
        oceDebug("RBR txt format\n")
        oceDebug(debug, "Format is Rtext Ruskin txt export", "\n")
        # FIXME: reading a  lot if there are lots of "Events". Is there a better way to do this?
        l <- readLines(file, n=50000)
        pushBack(l, file)
        model <- unlist(strsplit(l[grep("Model", l)], "="))[2]
        serialNumber <- as.numeric(unlist(strsplit(l[grep("Serial", l)], "="))[2])
        sampleInterval <- 1/as.numeric(gsub("Hz", "", unlist(strsplit(l[grep("SamplingPeriod", l)], "="))[2]))
        numberOfChannels <- as.numeric(unlist(strsplit(l[grep("NumberOfChannels", l)], "="))[2])
        oceDebug(debug, "Model: ", model, "\n")
        oceDebug(debug, "serialNumber: ", serialNumber, "\n")
        oceDebug(debug, "sampleInterval: ", sampleInterval, "\n")
        oceDebug(debug, "File has ", numberOfChannels, "channels", "\n")
        channelNames <- NULL
        for (iChannel in 1:numberOfChannels)
            channelNames <- c(channelNames,
                tolower(unlist(strsplit(l[grep(paste0("Channel\\[", iChannel, "\\]"), l)], "="))[2]))
        oceDebug(debug, "Channel names are:", channelNames, "\n")
        skip <- grep("Date & Time", l)      # Where should I start reading the data?
        oceDebug(debug, "Data starts on line", skip, "\n")
        d <- read.table(file, skip=skip, stringsAsFactors=FALSE, encoding=encoding)
        oceDebug(debug, "First time=", d$V1[1], d$V2[1], "\n")
        # Assume date and time are first two columns
        time <- as.POSIXct(paste(d$V1, d$V2), format="%d-%b-%Y %H:%M:%OS", tz=tz)
        n <- length(time)
        channels <- list()
        for (iChannel in 1:numberOfChannels)
            channels[[iChannel]] <- d[, iChannel+2]
        names(channels) <- channelNames
        # Now do subsetting
        if (inherits(from, "POSIXt") || inherits(from, "character")) {
            if (!inherits(to, "POSIXt") && !inherits(to, "character"))
                stop("if 'from' is POSIXt or character, then 'to' must be, also")
            if (to <= from)
                stop("cannot have 'to' <= 'from'")
            from <- as.numeric(difftime(as.POSIXct(from, tz=tz), measurementStart, units="secs")) / measurementDeltat
            oceDebug(debug, "inferred from =", format(from, width=7), " based on 'from' arg", from.keep, "\n")
            to <- as.numeric(difftime(as.POSIXct(to, tz=tz), measurementStart, units="secs")) / measurementDeltat
            oceDebug(debug, "inferred   to =",   format(to, width=7), " based on   'to' arg", to.keep, "\n")
        } else {
            if (from < 1)
                stop("cannot have 'from' < 1")
            if (!missing(to) && to < from)
                stop("cannot have 'to' < 'from'")
        }
        oceDebug(debug, "from=", from, "\n")
        oceDebug(debug, "to=", if (missing(to)) "(not given)" else format(to), "\n")
        oceDebug(debug, "by=", by, "\n")
        if (inherits(by, "character"))
            by <- ctimeToSeconds(by)/sampleInterval # FIXME: Is this right?
        oceDebug(debug, "inferred by=", by, "samples\n")
        # subset times
        if (inherits(from, "POSIXt") || inherits(from, "character")) {
            keep <- from <= time & time <= to # FIXME: from may be int or time
        } else {
            if (missing(to)) {
                keep <- seq.int(from, n, by)
            } else {
                keep <- seq.int(from, to, by)
            }
        }
        oceDebug(debug, "will be skipping time with seq(..., by=", by, ")\n")
        time <- time[keep]
        channelsSub <- list()
        for (iChannel in 1:numberOfChannels) {
            channelsSub[[iChannel]] <- channels[[iChannel]][keep]
        }
        names(channelsSub) <- channelNames
        res <- as.rsk(time, columns=channelsSub,
            instrumentType="rbr",
            serialNumber=serialNumber, model=model,
            sampleInterval=sampleInterval,
            filename=filename,
            debug=debug-1)
    } else {
        # to read the "old" TDR files
        while (TRUE) {
            line <- scan(file, what="char", sep="\n", n=1, quiet=TRUE)
            if (0 < (r<-regexpr("Temp[ \t]*Pres", line))) # nolint
                break
            header <- c(header, line)
            if (0 < (r<-regexpr("Logging[ \t]*start", line))) {
                l <- sub("[ ]*Logging[ \t]*start[ ]*", "", line)
                measurementStart <- as.POSIXct(strptime(l, "%y/%m/%d %H:%M:%S", tz=tz))
            }
            # "Logging end" would seem to be the sensible thing to examine,
            # but "Logger time" seems correct in SLEIWEX 2008 data.  I think
            # the issue is that the devices were turned off manually, and
            # that time (the relevant one) is in "Logger time".
            if (0 < (r<-regexpr("Logger[ \t]*time", line))) {
                l <- sub("[ ]*Logger[ \t]*time[ ]*", "", line)
                measurementEnd <- as.POSIXct(strptime(l, "%y/%m/%d %H:%M:%S", tz=tz))
            }
            if (0 < (r<-regexpr("Sample[ \t]*period", line))) {
                l <- sub("[ ]*Sample[ \t]*period[ ]*", "", line)
                sp <- as.numeric(strsplit(l, ":")[[1]])
                measurementDeltat <- (sp[3] + 60 * (sp[2] + 60*sp[1]))
            }
        }
        oceDebug(debug, "measurementStart =", format(measurementStart), "\n")
        oceDebug(debug, "measurementEnd =", format(measurementEnd), "\n")
        oceDebug(debug, "measurementDeltat  =", measurementDeltat, "\n")
        serialNumber <- strsplit(header[1], "[\t ]+")[[1]][4]
        oceDebug(debug, "serialNumber=", serialNumber, "\n")
        # Now that we know the logging times, we can work with 'from 'and 'to'
        if (inherits(from, "POSIXt") || inherits(from, "character")) {
            if (!inherits(to, "POSIXt") && !inherits(to, "character"))
                stop("if 'from' is POSIXt or character, then 'to' must be, also")
            if (to <= from)
                stop("cannot have 'to' <= 'from'")
            from <- as.numeric(difftime(as.POSIXct(from, tz=tz), measurementStart, units="secs")) / measurementDeltat
            oceDebug(debug, "inferred from =", format(from, width=7), " based on 'from' arg", from.keep, "\n")
            to <- as.numeric(difftime(as.POSIXct(to, tz=tz), measurementStart, units="secs")) / measurementDeltat
            oceDebug(debug, "inferred   to =",   format(to, width=7), " based on   'to' arg", to.keep, "\n")
        } else {
            if (from < 1)
                stop("cannot have 'from' < 1")
            if (!missing(to) && to < from)
                stop("cannot have 'to' < 'from'")
        }
        oceDebug(debug, "by=", by, "in argument list\n")
        by <- ctimeToSeconds(by)
        oceDebug(debug, "inferred by=", by, "s\n")
        # col.names <- strsplit(gsub("[ ]+"," ", gsub("[ ]*$","",gsub("^[ ]+","",line))), " ")[[1]]
        # Read a line to determine if there is a pair of columns for time
        line <- scan(file, what="char", sep="\n", n=1, quiet=TRUE)
        pushBack(line, file)
        line <- gsub("[ ]+$", "", gsub("^[ ]+", "", line))
        nvar <- length(strsplit(line, "[ ]+")[[1]])
        oceDebug(debug, " data line '", line, "' reveals ", nvar, " data per line\n", sep="")
        d <- scan(file, character(), quiet=TRUE) # read whole file (it's too tricky to bisect times with text data)
        n <- length(d) / nvar
        oceDebug(debug, "file has", length(d), "items; assuming", nvar, "items per line, based on first line\n")
        dim(d) <- c(nvar, n)
        if (nvar == 2) {
            time <- measurementStart + seq(from=0, to=n-1) * measurementDeltat
            Tcol <- 1
            pcol <- 2
        } else if (nvar == 4) {
            # This time conversion is the slowest part of this function.  With R 2.13.0a working on
            # a 620524-long vector: strptime() took 24s on a particular machine, and
            # as.POSIXct() took 104s.  So, use strptime(), if the first time seems
            # to be in a stanadard format.
            if (1 == grepl("[0-9]{4}/[0-3][0-9]/[0-3][0-9]", d[1, 1])) {
                time <- strptime(paste(d[1, ], d[2, ]), format="%Y/%m/%d %H:%M:%S", tz=tz)
            } else {
                time <- as.POSIXct(paste(d[1, ], d[2, ]), tz=tz)
            }
            Tcol <- 3
            pcol <- 4
        } else if (nvar == 5) {
            # 2008/06/25 10:00:00   18.5260   10.2225    0.0917
            if (1 == grepl("[0-9]{4}/[0-3][0-9]/[0-3][0-9]", d[1, 1])) {
                time <- strptime(paste(d[1, ], d[2, ]), format="%Y/%m/%d %H:%M:%S", tz=tz)
            } else {
                time <- as.POSIXct(paste(d[1, ], d[2, ]), tz=tz)
            }
            Tcol <- 3
            pcol <- 4
        } else
            stop("wrong number of variables; need 2, 4, or 5, but got ", nvar)    # subset
        # subset times
        if (inherits(from, "POSIXt") || inherits(from, "character")) {
            keep <- from <= time & time <= to # FIXME: from may be int or time
        } else {
            if (missing(to)) {
                look <- from:n
            } else {
                look <- from:to
            }
        }
        oceDebug(debug, "will be skipping time with seq(..., by=", by, ")\n")
        look <- seq.int(1, dim(d)[2], by=by)
        time <- time[look]
        temperature <- as.numeric(d[Tcol, look])
        pressure <- as.numeric(d[pcol, look])
        model <- ""
        res <- as.rsk(time, columns=list(temperature=temperature, pressure=pressure),
            instrumentType="rbr",
            serialNumber=serialNumber, model=model,
            filename=filename,
            debug=debug-1)
    }
    if (is.logical(patm)) {
        if (patm) {
            res@data$pressureOriginal <- res@data$pressure
            res@data$pressure <- res@data$pressure - 10.1325
            # No need to check patm=FALSE case because object default is "absolute"
            res@metadata$pressureType <- "sea"
        }
    } else if (is.numeric(patm)) {
        res@data$pressureOriginal <- res@data$pressure
        res@data$pressure <- res@data$pressure - patm[1]
        res@metadata$pressureType <- "sea"
    } else {
        stop("patm must be logical or numeric")
    }
    if (missing(processingLog))
        processingLog <- paste(deparse(match.call()), sep="", collapse="")
    res@processingLog <- processingLogAppend(res@processingLog, processingLog)
    oceDebug(debug, "} # read.rsk()\n", sep="", unindent=1)
    res
}


#' Create a ctd Object from an rsk Object
#'
#' A new `ctd` object is assembled from the contents of the `rsk` object.
#' The data and metadata are mostly unchanged, with an important exception: the
#' `pressure` item in the `data` slot may altered, because `rsk`
#' instruments measure total pressure, not sea pressure; see \dQuote{Details}.
#'
#' The `pressureType` element of the
#' `metadata` of `rsk` objects defines the pressure type, and this controls
#' how pressure is set up in the returned object. If `object@@metadata$pressureType`
#' is `"absolute"` (or `NULL`) then the resultant pressure will be adjusted
#' to make it into `"sea"` pressure. To do this, the value of
#' `object@@metadata$pressureAtmospheric` is inspected. If this is present, then
#' it is subtracted from `pressure`. If this is missing, then
#' standard pressure (10.1325 dbar) will be subtracted. At this stage, the
#' pressure should be near zero at the ocean surface, but some additional adjustment
#' might be necessary, and this may be indicated by setting the argument `pressureAtmospheric` to
#' a non-zero value to be subtracted from pressure.
#'
#' @param x an [rsk-class] object.
#'
#' @param pressureAtmospheric A numerical value (a constant or a vector),
#' that is subtracted from the pressure in `object` before storing it in the return value.
#'
#' @param longitude numerical value of longitude, in degrees East.
#'
#' @param latitude numerical value of latitude, in degrees North.
#'
#' @param ship optional string containing the ship from which the observations were made.
#'
#' @param cruise optional string containing a cruise identifier.
#'
#' @param station optional string containing a station identifier.
#'
#' @param deploymentType character string indicating the type of deployment (see
#' [as.ctd()]).
#'
#' @template debugTemplate
rsk2ctd <- function(x, pressureAtmospheric=0, longitude=NULL, latitude=NULL,
    ship=NULL, cruise=NULL, station=NULL, deploymentType=NULL,
    debug=getOption("oceDebug"))
{
    oceDebug(debug, "rsk2ctd(...) {\n", sep="", unindent=1)
    res <- new("ctd")
    res@metadata <- x@metadata
    # The user may have already inserted some metadata, even if read.rsk() didn't, so
    # we have to take care of two cases in deciding on some things. The procedure is
    # to use the argument to rsk2ctd if one is given, otherwise to use the value already
    # in x@metadata, otherwise to set a default that matches as.ctd().
    res@metadata$longitude <- if (!is.null(longitude)) {
        longitude
    } else {
        if (is.null(res@metadata$longitude)) {
            NA
        } else {
            res@metadata$longitude
        }
    }
    res@metadata$latitude <- if (!is.null(latitude)) {
        latitude
    } else {
        if (is.null(res@metadata$latitude)) {
            NA
        } else {
            res@metadata$latitude
        }
    }
    res@metadata$ship <- if (!is.null(ship)) {
        ship
    } else {
        if (is.null(res@metadata$ship)) {
            ""
        } else {
            res@metadata$ship
        }
    }
    res@metadata$cruise <- if (!is.null(cruise)) {
        cruise
    } else {
        if (is.null(res@metadata$cruise)) {
            ""
        } else {
            res@metadata$cruise
        }
    }
    res@metadata$station <- if (!is.null(station)) {
        station
    } else {
        if (is.null(res@metadata$station)) {
            ""
        } else {
            res@metadata$station
        }
    }
    res@metadata$deploymentType <- if (!is.null(deploymentType)) {
        deploymentType
    } else {
        if (is.null(res@metadata$deploymentType)) {
            "unknown"
        } else {
            res@metadata$deploymentType
        }
    }
    # We start by copying the data, but we may need to do some fancy footwork
    # for pressure, because RBR devices store absolute pressure, not the sea
    # pressure that we have in CTD objects.
    res@data <- x@data
    if (!("pressure" %in% names(res@data)))
        stop("there is no pressure in this rsk object, so it cannot be converted to a ctd object")
    pressureAtmosphericStandard <- 10.1325
    if (is.null(x@metadata$pressureType)) {
        oceDebug(debug, "metadata$pressureType is NULL so guessing absolute pressure:\n")
        warning("rsk object lacks metadata$pressureType; assuming absolute and subtracting standard atm pressure to get sea pressure")
        res@data$pressure <- x@data$pressure - pressureAtmosphericStandard
        res@metadata$units$pressure$scale <- "sea"
        res@metadata$dataNamesOriginal[substr(res@metadata$dataNamesOriginal, 1, 4) == "pres"] <- ""
        res@processingLog <- processingLogAppend(res@processingLog,
            paste("subtracted 10.1325dbar (std atm) from pressure\n"))
    } else {
        # subtract atm pressure, if it has not already been subtracted
        oceDebug(debug, "metadata$pressureType is not NULL\n")
        if ("sea" != substr(x@metadata$pressureType, 1, 3)) {
            oceDebug(debug, "must convert from absolute pressure to sea pressure\n")
            if (!("pressureAtmospheric" %in% names(x@metadata))) {
                oceDebug(debug, "pressure is 'absolute'; subtracting std atm 10.1325 dbar\n")
                res@data$pressure <- x@data$pressure - pressureAtmosphericStandard
                res@metadata$units$pressure$scale <- "sea"
                res@metadata$dataNamesOriginal[substr(res@metadata$dataNamesOriginal, 1, 4) == "pres"] <- ""
                res@processingLog <- processingLogAppend(res@processingLog,
                    paste("subtracted", pressureAtmosphericStandard, "dbar (std atm) from absolute pressure to get sea pressure"))
                oceDebug(debug, "subtracted std atm pressure from pressure\n")
            } else {
                res@data$pressure <- x@data$pressure - x@metadata$pressureAtmospheric
                res@metadata$units$pressure$scale <- "sea"
                res@metadata$dataNamesOriginal[substr(res@metadata$dataNamesOriginal, 1, 4) == "pres"] <- ""
                res@processingLog <- processingLogAppend(res@processingLog,
                    paste("subtracted",
                        x@metadata$pressureAtmospheric,
                        "dbar from absolute pressure to get sea pressure"))
                oceDebug(debug, "subtracted", x@metadata$pressureAtmospheric, "dbar from pressure\n")
            }
        }
    }
    # Now we have sea pressure (if the rsk was set up correctly for the above to
    # work right), so we can adjust a second time, if the user changed from the
    # default of pressureAtmospheric=0.
    if (pressureAtmospheric[1] != 0) {
        res@data$pressure <- res@data$pressure - pressureAtmospheric
        oceDebug(debug, "subtracted", pressureAtmospheric, "dbar from pressure")
    }
    res@processingLog <- processingLogAppend(res@processingLog,
        paste("subtracted", pressureAtmospheric, "dbar from sea pressure"))
    if (!("salinity" %in% names(x@data))) {
        C <- x[["conductivity"]]
        if (is.null(C)) {
            stop("objects must have salinity or conductivity to be converted to CTD form")
        }
        unit <- as.character(x@metadata$units$conductivity$unit)
        if (0 == length(unit)) {
            S <- swSCTp(x[["conductivity"]], x[["temperature"]], res[["pressure"]])
            res@processingLog <- processingLogAppend(res@processingLog, "calculating salinity based on conductivity in (assumed) ratio units")
        } else if (unit == "uS/cm") {
            S <- swSCTp(x[["conductivity"]]/42914.0, x[["temperature"]], res[["pressure"]])
            res@processingLog <- processingLogAppend(res@processingLog, "calculating salinity based on conductivity in uS/cm")
        } else if (unit == "mS/cm") {
            S <- swSCTp(x[["conductivity"]]/42.914, x[["temperature"]], res[["pressure"]])
            res@processingLog <- processingLogAppend(res@processingLog, "calculating salinity based on conductivity in mS/cm")
        } else if (unit == "S/m") {
            S <- swSCTp(x[["conductivity"]]/4.2914, x[["temperature"]], res[["pressure"]])
            res@processingLog <- processingLogAppend(res@processingLog, "calculating salinity based on conductivity in S/m")
        } else {
            stop("unrecognized conductivity unit '", unit, "'; only uS/cm, mS/cm and S/m are handled")
        }
        res <- oceSetData(res, name="salinity", value=S,
            unit=list(unit=expression(), scale="PSS-78"))
    }
    oceDebug(debug, "} # rsk2ctd()\n", sep="", unindent=1)
    res@processingLog <- processingLogAppend(res@processingLog,
        paste("rsk2ctd(..., pressureAtmospheric=", pressureAtmospheric, ", debug)\n",
            sep="", collapse=""))
    res
}


#' Estimate Atmospheric Pressure in Rsk Object
#'
#' Estimate atmospheric pressure in an [rsk-class] object.  Pressures must be in
#' decibars for this to work.  First, a subset of pressures is created, in which
#' the range is `sap-dp` to `sap+dp`.  Here, `sap`=10.1325 dbar is standard
#' sealevel atmospheric pressure.  Within this window, three measures of central
#' tendency are calculated: the median, the mean, and a weighted mean that has
#' weight given by
#' \eqn{exp(-2*((p-sap)/dp)^2)}{exp(-2*((p-sap)/dp)^2)}.
#'
#' @param x an [rsk-class] object.
#'
#' @param dp Half-width of pressure window to be examined (in decibars).
#'
#' @return A list of four estimates: `sap`, the median, the mean, and the weighted mean.
#'
#' @seealso
#' The documentation for [rsk-class] explains the structure of
#' `rsk` objects, and also outlines the other functions dealing with them.
#'
#' @examples
#' library(oce)
#' data(rsk)
#' print(rskPatm(rsk))
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
rskPatm <- function(x, dp=0.5)
{
    p <- if (inherits(x, "rsk")) x@data$pressure else x
    sap <- 10.1325                      # standard atm pressure
    if (length(p) < 1)
        return(rep(sap, 4))
    p <- p[(sap - dp) <= p & p <= (sap + dp)] # window near sap
    w <- exp(-2 * ((p - sap) / dp)^2)
    if (length(p) < 4) {
        rep(sap, 4)
    } else {
        c(sap, median(p), mean(p), weighted.mean(p, w))
    }
}

#' Decode table-of-contents File from a Rsk File
#'
#' Decode table-of-contents file from a rsk file, of the sort used by some
#' researchers at Dalhousie University.
#'
#' It is assumed that the `.TBL` file contains lines of the form \code{"File
#'   \\day179\\SL08A179.023 started at Fri Jun 27 22:00:00 2008"} The first step is
#' to parse these lines to get day and hour information, i.e.  179 and 023 in the
#' line above.  Then, recognizing that it is common to change the names of such
#' files, the rest of the file-name information in the line is ignored, and instead
#' a new file name is constructed based on the data files that are found in the
#' directory.  (In other words, the `"\\day179\\SL08A"` portion of the line is
#' replaced.)  Once the file list is complete, with all times put into R format,
#' then (optionally) the list is trimmed to the time interval indicated by
#' `from` and `to`.  It is important that `from` and `to` be in
#' the `UTC` time zone, because that time zone is used in decoding the lines
#' in the `.TBL` file.
#'
#' @param dir name of a directory containing a single table-of-contents file, with
#' `.TBL` at the end of its file name.
#'
#' @param from optional [POSIXct()] time, indicating the beginning of a
#' data interval of interest.  This must have timezone `"UTC"`.
#'
#' @param to optional [POSIXct()] time, indicating the end of a data
#' interval of interest.  This must have timezone `"UTC"`.
#'
#' @param debug optional integer to control debugging, with positive values
#' indicating to print information about the processing.
#'
#' @examples
#'\dontrun{
#' table <- rskToc("/data/archive/sleiwex/2008/moorings/m05/adv/sontek_202h/raw",
#' from=as.POSIXct("2008-07-01 00:00:00", tz="UTC"),
#'     to=as.POSIXct("2008-07-01 12:00:00", tz="UTC"))
#' print(table)
#'}
#'
#' @return
#' A list with two elements: `filename`, a vector of file names, and
#' `startTime`, a vector of [POSIXct()] times indicating the (real)
#' times of the first datum in the corresponding files.
#'
#' @author Dan Kelley
#'
#' @family things related to rsk data
rskToc <- function(dir, from, to, debug=getOption("oceDebug"))
{
    if (missing(dir))
        stop("need a 'dir', naming a directory containing a file with suffix .TBL, and also data files named in that file")
    tbl.files <- list.files(path=dir, pattern="*.TBL$")
    if (length(tbl.files) < 1)
        stop("could not locate a .TBL file in direcory ", dir)
    tref <- as.POSIXct("2010-01-01", tz="UTC") # arbitrary time, to make integers
    file.code <- NULL
    startTime <- NULL
    for (tbl.file in tbl.files) {
        oceDebug(debug, tbl.file)
        lines <- readLines(paste(dir, tbl.file, sep="/"))
        if (length(lines) < 1)
            stop("found no data in file ", paste(dir, tbl.file, sep="/"))
        # "File \\day179\\SL08A179.023 started at Fri Jun 27 22:00:00 2008"
        for (line in lines) {
            s <- strsplit(line, "[ \t]+")[[1]]
            if (length(s) > 2) {
                filename <- s[2]
                month <- s[6]
                day <- s[7]
                hms <- s[8]
                year <- s[9]
                t <- as.POSIXct(strptime(paste(year, month, day, hms), "%Y %b %d %H:%M:%S", tz="UTC"))
                len <- nchar(filename)
                code <- substr(filename, len-6, len)
                oceDebug(debug, s, "(", code, format(t), ")\n")
                file.code <- c(file.code, code)
                startTime <- c(startTime, as.numeric(t) - as.numeric(tref))
            }
        }
    }
    prefix <- list.files(dir, pattern=".*[0-9]$")[1]
    lprefix <- nchar(prefix)
    prefix <- substr(prefix, 1, lprefix-7)
    filename <- paste(dir, paste(prefix, file.code, sep=""), sep="/")
    startTime <- as.POSIXct(startTime + tref)
    oceDebug(debug, "from=", format(from), "\n")
    oceDebug(debug, "to=", format(to), "\n")
    if (!missing(from) && !missing(to)) {
        oceDebug(debug, "got", length(file.code), "candidate files")
        ok <- from <= startTime & startTime <= to
        oceDebug(debug, "ok=", ok, "\n")
        filename <- filename[ok]
        startTime <- startTime[ok]
        oceDebug(debug, "taking into account the times, ended up with", length(file.code), "files\n")
    }
    list(filename=filename, startTime=startTime)
}

Try the oce package in your browser

Any scripts or data that you put into this service are public.

oce documentation built on July 9, 2023, 5:18 p.m.