R/loadSoundscapeData.R

Defines functions checkFreqNames checkManta checkTimeName checkTriton checkInfinite fixHmdLabels loadSoundscapeData

Documented in loadSoundscapeData

#' @title Load Soundscape Data
#'
#' @description Reads and checks data to ensure formatting will work
#'   for other \code{PAMscapes} functions. Will read and check the
#'   formatting of CSV files, or check the formatting of dataframes.
#'   Can also read in MANTA NetCDF files and format the data
#'   appropriately.
#'
#' @param x a dataframe, path to a CSV file, or path to a MANTA
#'   NetCDF file, or folder containing these. If \code{x} is a vector
#'   of file paths then all will be read in and combined. If \code{x}
#'   is a folder, then all files with extension \code{extension} will
#'   be loaded. Note this will not load files within subfolders, only
#'   the main folder.
#' @param needCols names of columns that must be present in \code{x},
#'   if any are missing will trigger an error
#' @param skipCheck logical flag to skip some data checking, recommended
#'   to keep as \code{FALSE}
#' @param timeBin amount of time to bin data by, format can
#'   be "#Unit" e.g. \code{'2hour'} or \code{'1day'}
#' @param binFunction summary function to apply to data in each time bin,
#'   default is "median"
#' @param binCount logical flag to return the number of times in
#'   each time bin as column "binCount"
#' @param octave one of "original", "tol", or "ol". If "original" then
#'   nothing happens, otherwise data are converted to Octave-leve ("ol")
#'   or Third-Octave-Level ("tol") measurements using
#'   \link{createOctaveLevel}
#' @param label optional, if not \code{NULL} then this value will be
#'   added as an additional column "label" to the output
#' @param keepQuals quality flag values to keep. Accepts vector of
#'   integers from (1, 2, 3, 4) corresponding to flag labels "Good",
#'   "Not evaluated/Unknown", "Compromised/Questionable", and "Unusable/Bad".
#'   HMD levels for points with data quality flags outside of \code{keepQuals}
#'   will be marked as \code{NA}.
#' @param keepEffort if \code{TRUE} or \code{FALSE}, a logical flag whether or
#'   not to keep the effort information with the outputs (number of seconds
#'   per minute). If a numeric value, then any minutes with an effort value
#'   less than \code{keepEffort} will be removed (e.g. \code{50} will remove
#'   minutes with less than 50 seconds of effort)
#' @param dropNonHmd logical flag to drop non-standard hybrid millidecade
#'   bands, only applies to HMD type data. Some datasets have frequency
#'   values that are not part of the standard HMD bands (e.g. at exactly
#'   the Nyquist rate), if \code{TRUE} these will be removed.
#' @param tz timezone of the data being loaded, will be converted to UTC
#'   after load
#' @param extension only used if \code{x} is a folder, the file extension
#'   to load. Must be one of "nc" or "csv"
#'
#' @details Files created by MANTA and Triton software will be
#'   reformatted to have consisitent formatting. The first column
#'   will be renamed to "UTC", and columns containing soundscape
#'   metrics will be named using the convention "TYPE_FREQUENCY",
#'   e.g. "HMD_1", "HMD_2" for Manta hybrid millidecade mesaurements.
#'
#'   Inputs from sources other than MANTA or Triton can be accepted
#'   in either "wide" or "long" format. Wide format must follow
#'   the conventions above - first column "UTC", other columns
#'   named by "TYPE_FREQUENCY" where TYPE is consistent across all
#'   columns and FREQUENCY is in Hertz. Long format data must have
#'   the following columns:
#'   \describe{
#'     \item{"UTC"}{ - time of the measurement, in UTC timezone}
#'     \item{"type"}{ - the type of soundscape measurement e.g.
#'       PSD or OL, must be the same for all}
#'     \item{"frequency"}{ - the frequency of the measurement, in Hertz}
#'     \item{"value"}{ - the soundscape measurement value, usually dB}
#'   }
#'
#' @author Taiki Sakai \email{taiki.sakai@@noaa.gov}
#'
#' @return a dataframe
#'
#' @examples
#'
#' manta <- loadSoundscapeData(system.file('extdata/MANTAExampleSmall1.csv', package='PAMscapes'))
#' str(manta)
#' ol <- loadSoundscapeData(system.file('extdata/OLSmall.csv', package='PAMscapes'))
#' str(ol)
#' psd <- loadSoundscapeData(system.file('extdata/PSDSmall.csv', package='PAMscapes'))
#' str(psd)
#'
#' @export
#'
#' @importFrom data.table fread setDF
#' @importFrom lubridate force_tz with_tz
#' @importFrom future.apply future_lapply
#'
loadSoundscapeData <- function(x,
                               needCols=c('UTC'),
                               skipCheck=FALSE,
                               timeBin=NULL,
                               binFunction='median',
                               binCount=FALSE,
                               octave=c('original', 'tol', 'ol'),
                               label=NULL,
                               keepQuals=c(1),
                               keepEffort=TRUE,
                               dropNonHmd=TRUE,
                               tz='UTC',
                               extension=c('nc', 'csv')) {
    if(is.character(x) &&
       length(x) == 1 &&
       dir.exists(x)) {
        ext <- switch(match.arg(extension),
                      'nc' = '\\.nc$',
                      'csv' = '\\.csv$'
        )
        x <- list.files(x, pattern=ext, full.names=TRUE)
    }
    octave <- match.arg(octave)
    allowedExt <- '\\.nc$|\\.csv$'
    if(is.character(x)) {
        x <- x[grepl(allowedExt, x)]
        if(length(x) == 0) {
            warning('No files of appropriate type provided.')
            return(NULL)
        }
    }
    # combine if multiple files
    if(is.character(x) &&
       length(x) > 1) {
        x <- bind_rows(future_lapply(x, function(f) {
            loadSoundscapeData(f, needCols=needCols, skipCheck=skipCheck,
                               timeBin=timeBin, binFunction=binFunction,
                               binCount=binCount,
                               octave=octave, label=label,
                               keepQuals=keepQuals, keepEffort=keepEffort,
                               dropNonHmd = FALSE,
                               tz=tz)
        }, future.seed=NULL))
        freqCols <- whichFreqCols(x)
        freqVals <- colsToFreqs(colnames(x)[freqCols])
        type <- gsub('([A-z]*)_.*', '\\1', colnames(x)[freqCols][1])
        # standardizing to round to integer on all HMD columns
        if(type == 'HMD') {
            standardHmd <- paste0('HMD_', round(freqVals, 0))
            colnames(x)[freqCols] <- standardHmd
            hmdLevels <- getHmdLevels(freqRange=range(freqVals)+c(-1, 1))
            nonStandard <- !standardHmd %in% hmdLevels$labels
            newLabs <- fixHmdLabels(freqVals[nonStandard], hmdLevels=hmdLevels)
            repeatLabs <- newLabs[!is.na(newLabs)] %in% colnames(x)
            if(any(repeatLabs)) {
                warning('Input does not appear to be standard hybrid millidecade,',
                        ' proceed with caution')
                newLabs[!is.na(newLabs)][repeatLabs] <- NA
            }
            colnames(x)[freqCols][nonStandard][!is.na(newLabs)] <- newLabs[!is.na(newLabs)]
            if(anyNA(newLabs) &&
               isTRUE(dropNonHmd)) {
                warning('Found ', sum(is.na(newLabs)), ' non-standard ',
                        'hybrid millidecade frequencies (',
                        printN(standardHmd[nonStandard][is.na(newLabs)], collapse=', '),
                        ') these will be removed. Run with "dropNonHmd=FALSE"',
                        ' to keep them.')
                for(col in standardHmd[nonStandard][is.na(newLabs)]) {
                    x[[col]] <- NULL
                }
            }
        }
        return(x)
    }
    if(is.character(x)) {
        if(!file.exists(x)) {
            warning('File ', x, ' does not exist.')
            return(NULL)
        }
        if(grepl('csv$', x, ignore.case=TRUE)) {
            # head <- strsplit(read_lines(x, n_max = 1), ', ')[[1]]
            # first <- strsplit(read_lines(x, n_max=1, skip=1), ', ')[[1]]
            readTop <- strsplit(readLines(x, n=2), ',')
            if(length(readTop[[1]]) < length(readTop[[2]])) {
                warning('File ', x, ' has more data columns than column headers. Cannot load.')
                return(NULL)
            }
            x <- fread(x, header=TRUE)
            setDF(x)
        } else if(grepl('nc$', x, ignore.case=TRUE)) {
            x <- loadMantaNc(x, keepQuals=keepQuals, keepEffort=keepEffort)
        }
    }
    colnames(x) <- checkTimeName(colnames(x))
    x <- checkManta(x, keepEffort=keepEffort)
    if(isFALSE(skipCheck)) {
        # x <- checkTriton(x)
        x <- checkInfinite(x)
    }
    missingCols <- needCols[!needCols %in% colnames(x)]
    if(length(missingCols) > 0) {
        warning('Required columns ', paste0(missingCols, collapse=', '),
                ' are missing.')
        return(NULL)
    }
    if(is.character(x$UTC)) {
        x$UTC <- parseToUTC(x$UTC)
    }
    if(tz != 'UTC') {
        x$UTC <- force_tz(x$UTC, tzone=tz)
        x$UTC <- with_tz(x$UTC, tzone='UTC')
    }
    if(!isWide(colnames(x)) && !isLong(colnames(x))) {
        warning('Input "x" could not be formatted properly.')
        return(NULL)
    }
    # for now this check is just fixing 31_5 to 31.5
    colnames(x) <- checkFreqNames(colnames(x))
    freqCols <- whichFreqCols(x)
    type <- gsub('([A-z]*)_.*', '\\1', colnames(x)[freqCols][1])
    # standardizing to round to integer on all HMD columns
    if(type == 'HMD') {
        freqVals <- colsToFreqs(colnames(x)[freqCols])
        standardHmd <- paste0('HMD_', round(freqVals, 0))
        colnames(x)[freqCols] <- standardHmd
        hmdLevels <- getHmdLevels(freqRange=range(freqVals)+c(-1, 1))
        nonStandard <- !standardHmd %in% hmdLevels$labels
        newLabs <- fixHmdLabels(freqVals[nonStandard], hmdLevels=hmdLevels)
        repeatLabs <- newLabs[!is.na(newLabs)] %in% colnames(x)
        if(any(repeatLabs)) {
            warning('Input does not appear to be standard hybrid millidecade,',
                    ' proceed with caution')
            newLabs[!is.na(newLabs)][repeatLabs] <- NA
        }
        colnames(x)[freqCols][nonStandard][!is.na(newLabs)] <- newLabs[!is.na(newLabs)]
        if(anyNA(newLabs) &&
           isTRUE(dropNonHmd)) {
            warning('Found ', sum(is.na(newLabs)), ' non-standard ',
                    'hybrid millidecade frequencies (',
                    printN(standardHmd[nonStandard][is.na(newLabs)], collapse=', '),
                    ') these will be removed. Run with "dropNonHmd=FALSE"',
                    ' to keep them.')
            for(col in standardHmd[nonStandard][is.na(newLabs)]) {
                x[[col]] <- NULL
            }
        }
    }
    if(!is.null(timeBin)) {
        x <- binSoundscapeData(x, bin=timeBin, method=binFunction, binCount=binCount)
    }
    if(octave != 'original') {
        x <- createOctaveLevel(x, type=octave)
    }
    if(!is.null(label)) {
        x$label <- label
    }
    x
}


# i hate this we cant round to the same level because of REASONS
# so fix by matching closest in round(0) cases
fixHmdLabels <- function(freqVals, hmdLevels=NULL) {
    if(is.null(hmdLevels)) {
        hmdLevels <- getHmdLevels(freqRange=range(freqVals) + c(-1, 1))
    }
    newLabels <- rep(NA, length(freqVals))
    for(i in seq_along(freqVals)) {
        diffs <- abs(hmdLevels$freqs - freqVals[i])
        whichMin <- which.min(diffs)
        minDiff <- diffs[whichMin]
        if(minDiff <= 1) {
            newLabels[i] <- hmdLevels$labels[whichMin]
        }
    }
    newLabels
}

checkInfinite <- function(x, doWarn=TRUE) {
    infCols <- sapply(x, function(c) any(is.infinite(c)))
    if(!any(infCols)) {
        return(x)
    }
    infIx <- which(infCols)
    if(doWarn) {
        warning('Found infinite values in "x", they will be replaced with NA.')
    }
    for(i in infIx) {
        x[[i]][is.infinite(x[[i]])] <- NA
    }
    x
}

checkTriton <- function(x) {
    tritonTime <- "yyyy-mm-ddTHH:MM:SSZ"
    if(tritonTime %in% colnames(x)) {
        colnames(x)[colnames(x) == tritonTime] <- 'UTC'
    }
    alternate <- 'yyyy.mm.ddTHH.MM.SSZ'
    if(alternate %in% colnames(x)) {
        colnames(x)[colnames(x) == alternate] <- 'UTC'
    }
    alternate <- 'yyyy_mm_ddTHH_MM_SSZ'
    if(alternate %in% colnames(x)) {
        colnames(x)[colnames(x) == alternate] <- 'UTC'
    }
    x
}

checkTimeName <- function(x) {
    if(is.data.frame(x)) {
        names <- checkTimeName(names(x))
        names(x) <- names
        return(x)
    }
    tritonTime <- "yyyy-mm-ddTHH:MM:SSZ"
    if(tritonTime %in% x) {
        x[x == tritonTime] <- 'UTC'
    }
    alternate <- 'yyyy.mm.ddTHH.MM.SSZ'
    if(alternate %in% x) {
        x[x == alternate] <- 'UTC'
    }
    alternate <- 'yyyy_mm_ddTHH_MM_SSZ'
    if(alternate %in% x) {
        x[x == alternate] <- 'UTC'
    }
    x
}

# colnames are d-m-y h:m:s, 0, 0-freq end
checkManta <- function(x, keepEffort=FALSE) {
    if(all(grepl('^X', colnames(x)))) {
        colnames(x) <- gsub('^X', '', colnames(x))
    }
    dateCol <- colnames(x)[1]
    mantaFormat <- c('%d-%b-%Y %H:%M:%S', '%m/%d/%Y %H:%M:%S',
                     '%d.%b.%Y.%H.%M.%S', '%m.%d.%Y.%H.%M.%S')
    tryConvert <- suppressWarnings(parse_date_time(dateCol, orders=mantaFormat, tz='UTC', truncated=2))
    # manta has the date as first column name, if we couldnt convert
    # then this isnt manta
    if(is.na(tryConvert)) {
        return(x)
    }
    # manta second col is seconds? only sometimes
    checkSeconds <- all(x[[2]] <= 60)
    secondCol <- grepl('^0\\.{3}[0-9]{1}$', colnames(x)[2]) ||
        (colnames(x)[2] == '0' & colnames(x)[3] %in% c('0', '0.1'))
    checkSeconds <- checkSeconds & secondCol
    freqIx <- 2:ncol(x)
    if(isTRUE(checkSeconds)) {
        if(isFALSE(keepEffort)) {
            x[[2]] <- NULL
            colnames(x)[2] <- '0'
            freqIx <- 2:ncol(x)
        } else if(isTRUE(keepEffort)) {
            colnames(x)[2:3] <- c('effortSeconds', '0')
            freqIx <- 3:ncol(x)
        }
    }
    # manta should have columns named just frequency for 2:ncol
    # if we cant convert w/o NA, then its not manta
    freqCols <- colnames(x)[freqIx]
    tryFreq <- suppressWarnings(as.numeric(freqCols))
    # sometimes written as 31_5 instead of 31.5?
    if(anyNA(tryFreq)) {
        freqCols <- gsub('_', '.', freqCols)
        tryFreq <- suppressWarnings(as.numeric(freqCols))
    }
    if(anyNA(tryFreq)) {
        return(x)
    }
    colnames(x)[1] <- 'UTC'
    if(is.character(x$UTC)) {
        x$UTC <- parse_date_time(x$UTC, orders=mantaFormat, tz='UTC', truncated=2)
    }
    colnames(x)[freqIx] <- paste0('HMD_', freqCols)
    x
}

checkFreqNames <- function(x) {
    if(is.data.frame(x)) {
        x <- colnames(x)
    }
    # grepl('[0-9]+_[0-9]+', x)
    x <- gsub('(.*)([0-9]+)_([0-9]+)', '\\1\\2.\\3', x)
    x
}

Try the PAMscapes package in your browser

Any scripts or data that you put into this service are public.

PAMscapes documentation built on April 4, 2025, 2:17 a.m.