PAMmisc: Miscellaneous Functions for Passive Acoustic Analysis

Documented in ncToData

#' @title Match Data From a Netcdf File
#'
#' @description Extracts all variables from a netcdf file matching Longitude,
#'   Latitude, and UTC coordinates in given dataframe
#'
#' @param data dataframe containing Longitude, Latitude, and UTC to extract matching
#'   variables from the netcdf file
#' @param nc name of a netcdf file
#' @param var (optional) character vector of variable names to match. If \code{NULL}, all
#'   variables present in \code{nc} will be used
#' @param buffer vector of Longitude, Latitude, and Time (seconds) to buffer around
#'   each datapoint. All values within the buffer will be used to report the mean,
#'   median, and standard deviation
#' @param FUN a vector or list of functions to apply to the data. Default is to apply
#'   mean, median, and standard deviation calculations
#' @param raw logical flag to return only the raw values of the variables. If \code{TRUE}
#'   the output will be changed to a list with length equal to the number of data points.
#'   Each item in the list will have separate named entries for each variable that will have
#'   all values within the given buffer and all values for any Z coordinates present.
#' @param keepMatch logical flag to keep the matched coordinates, these are useful to make sure
#'   the closest point is actually close to your XYZT
#' @param progress logical flag to show progress bar for matching data
#' @param depth depth values (meters) to use for matching, overrides any \code{Depth} column
#'   in the data or can be used to specify desired depth range when not present in data.
#'   Variables will be summarised over the range of these depth values. \code{NULL}
#'   uses all available depth values
#' @param verbose logical flag to show warning messages for possible coordinate mismatch
#' @param \dots not used
#'
#' @return original dataframe with three attached columns for each variable in the netcdf
#'   file, one for each of mean, median, and standard deviation of all values within the buffer
#'
#' @author Taiki Sakai \email{taiki.sakai@@noaa.gov}
#'
#' @examples
#'
#' data <- data.frame(Latitude = 32, Longitude = -117,
#'                    UTC = as.POSIXct('2005-01-01 00:00:00', tz='UTC'))
#' nc <- system.file('extdata', 'sst.nc', package='PAMmisc')
#' # default calculates mean
#' ncToData(data, nc = nc)
#' # calculate mean, median, and sd
#' ncToData(data, nc=nc, FUN=c(mean, median, sd), buffer = c(.01, .01, 86400))
#' # custom function
#' meanPlusOne <- function(x) {
#'     mean(x, na.rm=TRUE) + 1
#' }
#' ncToData(data, nc=nc, FUN=c(mean, meanPlusOne))
#'
#' @importFrom ncdf4 nc_open nc_close ncvar_get
#' @importFrom dplyr bind_rows bind_cols
#' @importFrom purrr transpose
#'
#' @export
#'
ncToData <- function(data, nc, var=NULL, buffer = c(0,0,0), FUN = c(mean),
                     raw = FALSE, keepMatch=TRUE, progress=TRUE, depth=0, verbose=TRUE, ...) {
    if(!is.character(nc) ||
       !file.exists(nc)) {
        warning('"nc" ', nc, ' is not a valid file.')
        if(isTRUE(raw)) {
            return(vector('list', length=nrow(data)))
        } else {
            return(data)
        }
    }
    nc <- nc_open(nc)
    on.exit(nc_close(nc))
    nc <- romsCheck(nc)
    if(isStationary(nc)) {
        return(stationaryNcToData(data, nc, var, buffer, FUN, raw, keepMatch, progress, depth,
                                  verbose, ...))
    }
    oldNames <- colnames(data)
    colnames(data) <- standardCoordNames(oldNames)
    # if(all(is.na(data$Longitude)) ||
    #    all(is.na(data$Latitude))) {
    #     return(data)
    # }
    # this finds the name of the longitude coordinate from my list that matches lon/long/whatever to Longitude
    nc180 <- ncIs180(nc)
    data180 <- dataIs180(data)

    if(nc180 != data180) {
        data <- to180(data, inverse = !nc180)
    }
    # for each variable, make the ncvar_get call which needs start and count#
    # these are XYZT if Z is present, -1 count means get all
    # sometimes coords are stored as variables, drop these
    dropVar <- c('latitude', 'longitude', 'month', 'day', 'year', 'lon', 'lat')
    # also drop anything with no dimensions - we can't match them anyway and they cause errors
    nDim <- lapply(nc$var, function(v) v$ndim)
    dropVar <- c(dropVar, names(nc$var)[which(nDim == 0)])
    varNames <- names(nc$var)[!(names(nc$var) %in% dropVar)]
    if(!is.null(var)) {
        hasVar <- var %in% varNames
        if(!any(hasVar)) {
            stop('None of the desired variables (',
                 paste0(var, collapse=','),
                 ') were present in the NC file.')
        }
        if(!all(hasVar)) {
            warning('Some of the desired variables (',
                    paste0(var[!hasVar], collapse=','),
                    ') were missing from the NC file.')
        }
        varNames <- var[hasVar]
    }
    usedDim <- unique(unlist(sapply(nc$var, function(x) x$dimids+1)))
    usedDim <- usedDim[!is.na(usedDim)]
    names(nc$dim)[usedDim] <- standardCoordNames(names(nc$dim)[usedDim])

    if('Depth' %in% names(nc$dim)) {
        matchDims <- c('matchLong', 'matchLat', 'matchTime', 'matchDepth')
    } else {
        matchDims <- c('matchLong', 'matchLat', 'matchTime')
    }
    allVar <- vector('list', length = length(varNames) + length(matchDims))
    names(allVar) <- c(varNames, matchDims)
    for(v in seq_along(allVar)) {
        allVar[[v]] <- vector('list', length = nrow(data))
    }
    reqDims <- names(nc$dim)[names(nc$dim) %in% c('Longitude', 'Latitude', 'UTC')]
    if(!all(reqDims %in% names(data))) {
        stop('data must have columns ', paste0(reqDims, collapse=', '))
    }
    if('UTC' %in% names(nc$dim)) {
        nc$dim$UTC$vals <- ncTimeToPosix(nc$dim$UTC)
    }
    # nc$dim$Longitude$vals <- round(nc$dim$Longitude$vals, 5)
    # nc$dim$Latitude$vals <- round(nc$dim$Latitude$vals, 5)
    if(progress) {
        cat('Matching data...\n')
        pb <- txtProgressBar(min=0, max=nrow(data), style=3)
    }
    for(v in varNames) {
        names(nc$var[[v]]$dim) <- names(nc$dim)[nc$var[[v]]$dimids + 1]
    }
    possDims <- c('UTC', 'Latitude', 'Longitude', 'Depth')
    hasDims <- names(data)[names(data) %in% possDims]
    for(i in 1:nrow(data)) {
        varData <- getVarData(data[hasDims][i,], nc=nc, var=varNames, buffer = buffer,
                              depth=depth, verbose=verbose)
        for(v in varNames) {
            allVar[[v]][[i]] <- varData[[v]]
        }
        for(c in matchDims) {
            allVar[[c]][[i]] <- varData[[c]]
        }
        if(progress) {
            setTxtProgressBar(pb, value=i)
        }
    }
    if(raw) {
        return(transpose(allVar))
    }

    # if its a single function make a list for looping
    if(is.list(FUN) &&
       is.null(names(FUN))) {
        names(FUN) <- as.character(substitute(FUN))[-1]
    } else if(is.function(FUN)) {
        tmpName <- as.character(substitute(FUN))
        FUN <- list(FUN)
        names(FUN) <- tmpName
    }
    fnames <- names(FUN)

    for(v in varNames) {
        for(f in seq_along(FUN)) {
            if(fnames[f] %in% c('mean', 'median', 'sd')) {
                # FUN[[f]] <- function(x) FUN[[f]](x, na.rm=TRUE)
                data[[paste0(v, '_', fnames[f])]] <- sapply(allVar[[v]], function(x) FUN[[f]](x, na.rm=TRUE))
            } else {
                data[[paste0(v, '_', fnames[f])]] <- sapply(allVar[[v]], FUN[[f]])
            }
        }
    }
    if(keepMatch) {
        for(c in matchDims) {
            data[[paste0(c, '_mean')]] <- sapply(allVar[[c]], function(x) mean(x, na.rm=TRUE))
        }
        if(!is.null(data$matchTime_mean) &&
           !(all(is.na(data$matchTime_mean))) &&
           max(abs(data$matchTime_mean), na.rm=TRUE) > 365) {
            data$matchTime_mean <- as.POSIXct(data$matchTime_mean, origin = '1970-01-01 00:00:00', tz='UTC')
        }
    }
    data <- to180(data, inverse=!data180)
    # change back to whatever they were using
    colnames(data)[1:length(oldNames)] <- oldNames
    data
}

getVarData <- function(data, nc, var, buffer, depth=NULL, verbose=TRUE) {
    xIx <- dimToIx(data$Longitude, nc$dim$Longitude, buffer[1], verbose)
    yIx <- dimToIx(data$Latitude, nc$dim$Latitude, buffer[2], verbose)
    hasT <- 'UTC' %in% names(nc$dim)
    multiT <- sum(names(nc$dim) == 'UTC') > 1
    if(hasT &&
       !multiT) {
        tIx <- dimToIx(data$UTC, nc$dim$UTC, buffer[3], verbose)
        tVals <- ncTimeToPosix(nc$dim$UTC$vals[tIx$ix], units = nc$dim$UTC$units)
    }
    if(!hasT) {
        tVals <- NA
    }
    hasZ <- 'Depth' %in% names(nc$dim)
    if(hasZ) {
        if(is.null(depth)) {
            if('Depth' %in% colnames(data)) {
                # no buffer for depth
                zIx <- dimToIx(data$Depth, nc$dim$Depth, 0, verbose=FALSE)
                zVals <- nc$dim$Depth$vals[zIx$ix]
            } else {
                zIx <- list(start=1, count=-1)
                zVals <- nc$dim$Depth$vals
            }
        } else {
            zIx <- dimToIx(depth, nc$dim$Depth, 0, verbose=FALSE)
            zVals <- nc$dim$Depth$vals[zIx$ix]
        }
    }
    result <- vector('list', length = length(var) + 3 + hasZ)
    outNames <- c(var, 'matchLong', 'matchLat', 'matchTime')
    if(hasZ) {
        outNames <- c(outNames, 'matchDepth')
    }
    names(result) <- outNames
    checkNa <- c(xIx$start, xIx$count, yIx$start, yIx$count)
    if(hasT &&
       !multiT) {
        checkNa <- c(checkNa, tIx$start, tIx$count)
    }
    if(hasZ) {
        checkNa <- c(checkNa, zIx$start, zIx$count)
    }
    if(any(is.na(checkNa))) {
        for(r in seq_along(result)) {
            result[[r]] <- NA
        }
        return(result)
    }
    for(v in var) {
        thisVar <- nc[['var']][[v]]
        thisDimId <- thisVar$dimids+1
        thisDim <- names(nc$dim)[thisDimId]
        start <- numeric(0)
        count <- numeric(0)
        # build start/count vectors. Typically in XYZT order, but occasionally there
        # are differences. So go in order of dimids (which is 0 indexed so +1)
        for(d in seq_along(thisDim)) {
            switch(thisDim[d],
                   'Longitude' = {
                       start <- c(start, xIx$start)
                       count <- c(count, xIx$count)
                   },
                   'Latitude' = {
                       start <- c(start, yIx$start)
                       count <- c(count, yIx$count)
                   },
                   'Depth' = {
                       start <- c(start, zIx$start)
                       count <- c(count, zIx$count)
                   },
                   'UTC' = {
                       if(multiT) {
                           tIx <- dimToIx(data$UTC, nc$dim[[thisDimId[d]]], buffer[3], verbose)
                           tVals <- ncTimeToPosix(nc$dim[[thisDimId[d]]]$vals[tIx$ix],
                                                  units = nc$dim[[thisDimId[d]]]$units)
                           if(is.na(tIx$start) || is.na(tIx$count)) {
                               for(r in seq_along(result)) {
                                   result[[r]] <- NA
                               }
                               return(result)
                           }
                       }
                       start <- c(start, tIx$start)
                       count <- c(count, tIx$count)
                   },
                   # if random other variable, only option is to read all of it
                   {
                       start <- c(start, 1)
                       count <- c(count, -1)
                   }
            )
        }
        result[[v]] <- ncvar_get(nc, varid=v, start=start, count=count)
    }
    result$matchLong <- nc$dim$Longitude$vals[xIx$ix]
    result$matchLat <- nc$dim$Latitude$vals[yIx$ix]
    result$matchTime <- tVals
    if(hasZ) {
        result$matchDepth <- zVals
    }
    result
}
# checking for very specific tabledap style data where corods are all in vars
# and we can only deal with stationary versions of this
isStationary <- function(nc) {
    names(nc$dim) <- standardCoordNames(names(nc$dim))
    if(all(c('UTC', 'Latitude', 'Longitude') %in%
           names(nc$dim))) {
        return(FALSE)
    }
    newVar <- standardCoordNames(names(nc$var))
    if(!'UTC' %in% c(names(nc$dim), newVar)) {
        # stop('No time data found in this file')
        return(FALSE)
    }
    if('Longitude' %in% newVar) {
        vals <- ncvar_get(nc, varid=names(nc$var)[newVar == 'Longitude'])
        if(diff(range(vals)) > 1e-4) {
            return(FALSE)
        }
    }
    if('Latitude' %in% newVar) {
        vals <- ncvar_get(nc, varid=names(nc$var)[newVar == 'Latitude'])
        if(diff(range(vals)) > 1e-4) {
            return(FALSE)
        }
    }
    TRUE
}

stationaryNcToData <- function(data, nc, var=NULL, buffer = c(0,0,0), FUN = c(mean),
                     raw = FALSE, keepMatch=TRUE, progress=TRUE, depth=0, verbose=TRUE, ...) {
    oldNames <- colnames(data)
    colnames(data) <- standardCoordNames(oldNames)
    newVar <- standardCoordNames(names(nc$var))
    names(nc$dim) <- standardCoordNames(names(nc$dim))
    if(!'UTC' %in% names(nc$dim)) {
        nc$dim$UTC <- list(
            vals=ncvar_get(nc, varid=names(nc$var)[newVar == 'UTC']),
            units=nc$var[[which(newVar == 'UTC')]]$units,
            name='UTC'
        )
        nc$dim$UTC$len <- length(nc$dim$UTC$vals)
    }
    if(isFALSE(keepMatch)) {
        warning('For stationary environmental sources it is recommended to use',
                ' "keepMatch=TRUE" since matched distances may be far')
    }
    # also drop anything with no dimensions - we can't match them anyway and they cause errors
    nDim <- lapply(nc$var, function(v) v$ndim)
    dropVar <- names(nc$var)[newVar %in% c('Latitude', 'Longitude', 'Depth', 'UTC')]
    dropVar <- c(dropVar, names(nc$var)[which(nDim == 0)])
    varNames <- names(nc$var)[!(names(nc$var) %in% dropVar)]
    if(!is.null(var)) {
        hasVar <- var %in% varNames
        if(!any(hasVar)) {
            stop('None of the desired variables (',
                 paste0(var, collapse=','),
                 ') were present in the NC file.')
        }
        if(!all(hasVar)) {
            warning('Some of the desired variables (',
                    paste0(var[!hasVar], collapse=','),
                    ') were missing from the NC file.')
        }
        varNames <- var[hasVar]
    }
    matchDims <- c('matchLong', 'matchLat', 'matchTime', 'matchDepth')[
        c('Longitude', 'Latitude', 'UTC', 'Depth') %in% names(data)
    ]
    
    allVar <- vector('list', length = length(varNames) + length(matchDims))
    names(allVar) <- c(varNames, matchDims)
    for(v in seq_along(allVar)) {
        allVar[[v]] <- vector('list', length = nrow(data))
    }

    if('UTC' %in% names(nc$dim)) {
        nc$dim$UTC$vals <- ncTimeToPosix(nc$dim$UTC)
    }
    if(progress) {
        cat('Matching data...\n')
        pb <- txtProgressBar(min=0, max=nrow(data), style=3)
    }

    possDims <- c('UTC', 'Latitude', 'Longitude', 'Depth')
    hasDims <- names(data)[names(data) %in% possDims]
    
    matchList <- vector('list', length=length(matchDims))
    names(matchList) <- matchDims
    if('Longitude' %in% newVar) {
        matchList[['matchLong']] <- ncvar_get(nc, 
                                              varid=names(nc$var)[newVar == 'Longitude'],
                                              start=1, 
                                              count=1)
    } else {
        matchList[['matchLong']] <- NA
    }
    if('Latitude' %in% newVar) {
        matchList[['matchLat']]<- ncvar_get(nc, 
                                            varid=names(nc$var)[newVar == 'Latitude'],
                                            start=1, 
                                            count=1)
    } else {
        matchList[['matchLat']] <- NA
    }
    if('Depth' %in% newVar) {
        matchList[['matchDepth']]<- ncvar_get(nc, 
                                              varid=names(nc$var)[newVar == 'Depth'],
                                              start=1, 
                                              count=1)
    } else {
        matchList[['matchDepth']] <- NA
    }
    for(i in 1:nrow(data)) {
        tIx <- dimToIx(data$UTC[i], nc$dim$UTC, buffer[3], verbose)
        matchList[['matchTime']] <- nc$dim$UTC$vals[tIx$ix]
        for(v in varNames) {
            allVar[[v]][[i]] <- ncvar_get(nc, varid=v, start=tIx$start, count=tIx$count)
        }
        for(c in matchDims) {
            allVar[[c]][[i]] <- matchList[[c]]
        }
        if(progress) {
            setTxtProgressBar(pb, value=i)
        }
    }
    if(raw) {
        return(transpose(allVar))
    }
    
    # if its a single function make a list for looping
    if(is.list(FUN) &&
       is.null(names(FUN))) {
        names(FUN) <- as.character(substitute(FUN, env=parent.frame()))[-1]
    } else if(is.function(FUN)) {
        tmpName <- as.character(substitute(FUN, env=parent.frame()))
        FUN <- list(FUN)
        names(FUN) <- tmpName
    }
    fnames <- names(FUN)
    
    for(v in varNames) {
        for(f in seq_along(FUN)) {
            if(fnames[f] %in% c('mean', 'median', 'sd')) {
                # FUN[[f]] <- function(x) FUN[[f]](x, na.rm=TRUE)
                data[[paste0(v, '_', fnames[f])]] <- sapply(allVar[[v]], function(x) FUN[[f]](x, na.rm=TRUE))
            } else {
                data[[paste0(v, '_', fnames[f])]] <- sapply(allVar[[v]], FUN[[f]])
            }
        }
    }
    if(keepMatch) {
        for(c in matchDims) {
            data[[paste0(c, '_mean')]] <- sapply(allVar[[c]], function(x) mean(x, na.rm=TRUE))
        }
        if(!is.null(data$matchTime_mean) &&
           !(all(is.na(data$matchTime_mean))) &&
           max(abs(data$matchTime_mean), na.rm=TRUE) > 365) {
            data$matchTime_mean <- as.POSIXct(data$matchTime_mean, origin = '1970-01-01 00:00:00', tz='UTC')
        }
    }
    # data <- to180(data, inverse=!data180)
    # change back to whatever they were using
    colnames(data)[1:length(oldNames)] <- oldNames
    data
    
}