R/summarize_reference.R

Defines functions summarize_reference

Documented in summarize_reference

#' @title Summarize temporal and frequency dimensions of annotations and gaps
#'
#' @description \code{summarize_reference} summarizes temporal and frequency dimensions of annotations and gaps
#' @usage summarize_reference(reference, path = NULL, by.sound.file = FALSE,
#' units = c("ms", "kHz"), digits = 2)
#' @param reference Data frame or 'selection.table' (following the warbleR package format) with the reference selections (start and end of the sound events) that will be used to evaluate the performance of the detection, represented by those selections in 'detection'. Must contained at least the following columns: "sound.files", "selec", "start" and "end". If frequency range columns are included ("bottom.freq" and "top.freq") these are also used to characterize reference selections.
#' @param path Character string containing the directory path where the sound files are located. If supplied then duty cycle and peak frequency features are returned. These features are more helpful for tuning a energy-based detection. Default is \code{NULL}.
#' @param by.sound.file Logical argument to control whether features are summarized across sound files (when \code{by.sound.file = FALSE}, and more than 1 sound file is included in 'reference') or shown separated by sound file. Default is \code{FALSE}.
#' @param units A character vector of length 2 with the units to be used for time and frequency parameters, in that order. Default is \code{c("ms", "kHz")}. It can also take 's' and 'Hz'.
#' @param digits Numeric vector of length 1 with the number of decimals to include. Default is 2.
#' @return The function returns the mean, minimum and maximum duration of selections and gaps (time intervals between selections) and of the number of annotations by sound file. If frequency range columns are included in the reference table (i.e. "bottom.freq" and "top.freq") the minimum bottom frequency ('min.bottom.freq') and the maximum top frequency ('max.top.freq') are also estimated. Finally, if the path to the sound files in 'reference' is supplied the duty cycle (fraction of a sound file corresponding to target sound events) and peak amplitude (highest amplitude in a detection) are also returned. If `by.sound.file = FALSE` a matrix with features in rows is returned. Otherwise a data frame is returned in which each row correspond to a sound file. By default, time features are returned in 'ms' while frequency features in 'kHz' (but see 'units' argument).
#' @export
#' @name summarize_reference
#' @details The function extracts quantitative features from reference tables that can inform the range of values to be used in a energy-based detection optimization routine. Features related to selection duration can be used to set the 'max.duration' and 'min.duration' values, frequency related features can inform bandpass values, gap related features inform hold time values and duty cycle can be used to evaluate performance.
#' @examples {
#'   # load data and save example files into temporary working directory
#'   data("lbh1", "lbh2", "lbh_reference")
#'   tuneR::writeWave(lbh1, file.path(tempdir(), "lbh1.wav"))
#'   tuneR::writeWave(lbh2, file.path(tempdir(), "lbh2.wav"))
#'
#'   # summary across sound files
#'   summarize_reference(reference = lbh_reference, path = tempdir())
#'
#'   # summary across sound files
#'   summarize_reference(reference = lbh_reference, by.sound.file = TRUE, path = tempdir())
#' }
#' @seealso \code{\link{optimize_energy_detector}}, \code{\link{optimize_template_detector}}
#' @author Marcelo Araya-Salas \email{marcelo.araya@@ucr.ac.cr})
#'
#' @references 
#' Araya-Salas, M., Smith-Vidaurre, G., Chaverri, G., Brenes, J. C., Chirino, F., Elizondo-Calvo, J., & Rico-Guevara, A. 2022. ohun: an R package for diagnosing and optimizing automatic sound event detection. BioRxiv, 2022.12.13.520253. https://doi.org/10.1101/2022.12.13.520253
#' 

summarize_reference <-
  function(reference,
           path = NULL,
           by.sound.file = FALSE,
           units = c("ms", "kHz"),
           digits = 2) {
    # check arguments
    arguments <- as.list(base::match.call())

    # add objects to argument names
    for (i in names(arguments)[-1]) {
      arguments[[i]] <- get(i)
    }

    # check each arguments
    check_results <- check_arguments(fun = arguments[[1]], args = arguments)

    # report errors
    checkmate::reportAssertions(check_results)

    # convert units to lower case
    units <- tolower(units)

    # get features
    internal_summarize_reference <-
      function(reference,
               path = NULL,
               total.annotations = FALSE) {
        reference$duration <- reference$end - reference$start
        reference <- warbleR::gaps(X = reference, pb = FALSE)
        count_annotations <- table(reference$sound.files)

        output <-
          data.frame(min.sel.duration = min(reference$duration, na.rm = TRUE))
        output$mean.sel.duration <-
          mean(reference$duration, na.rm = TRUE)
        output$max.sel.duration <- max(reference$duration, na.rm = TRUE)
        output$min.gap.duration <- if (any(!is.na(reference$gaps)))
          min(reference$gaps, na.rm = TRUE) else
            NA
        output$mean.gap.duration <- if (any(!is.na(reference$gaps)))
          mean(reference$gaps, na.rm = TRUE) else
            NA
        output$max.gap.duration <- if (any(!is.na(reference$gaps)))
          max(reference$gaps, na.rm = TRUE) else
            NA

        if (total.annotations) {
          output$annotations <- nrow(reference)
        } else {
          output$min.annotations <- min(count_annotations)
          output$mean.annotations <- mean(count_annotations)
          output$max.annotations <- max(count_annotations)
        }

        # frequency range descriptors
        if (!is.null(reference$bottom.freq) &
          !is.null(reference$top.freq)) {
          output$min.bottom.freq <- min(reference$bottom.freq, na.rm = TRUE)
          output$mean.bottom.freq <-
            mean(reference$bottom.freq, na.rm = TRUE)
          output$max.bottom.freq <-
            max(reference$bottom.freq, na.rm = TRUE)
          output$min.top.freq <- min(reference$top.freq, na.rm = TRUE)
          output$mean.top.freq <- mean(reference$top.freq, na.rm = TRUE)
          output$max.top.freq <- max(reference$top.freq, na.rm = TRUE)
        }

        if (!is.null(path)) {
          durs <-
            warbleR::duration_sound_files(
              files = unique(reference$sound.files),
              path = path
            )

          durs$duty.cycle <- vapply(seq_len(nrow(durs)), function(x) {
            sum(reference$duration[reference$sound.files == durs$sound.files[x]], na.rm = TRUE) / durs$duration[x]
          }, FUN.VALUE = numeric(1))

          output$min.duty.cycle <- min(durs$duty.cycle, na.rm = TRUE)
          output$mean.duty.cycle <- mean(durs$duty.cycle, na.rm = TRUE)
          output$max.duty.cycle <- max(durs$duty.cycle, na.rm = TRUE)

          # measure peak amplitude
          peak_amp <-
            warbleR::sound_pressure_level(reference,
              type = "peak",
              path = path,
              pb = FALSE
            )

          output$min.peak.amplitude <- min(peak_amp$SPL, na.rm = TRUE)
          output$mean.peak.amplitude <- mean(peak_amp$SPL, na.rm = TRUE)
          output$max.peak.amplitude <- max(peak_amp$SPL, na.rm = TRUE)
        }

        return(output)
      }

    # force by.sound.files if only 1 sound file in reference
    if (length(unique(reference$sound.files)) == 1) {
      by.sound.file <- TRUE
    }

    if (!by.sound.file) {
      output <-
        internal_summarize_reference(reference, path, total.annotations = FALSE)
    } else {
      output_list <- lapply(unique(reference$sound.files), function(x) {
        sub_output <-
          internal_summarize_reference(
            reference = reference[reference$sound.files == x, ],
            path,
            total.annotations = TRUE
          )
        sub_output$sound.files <- x
        return(sub_output)
      })

      output <- do.call(rbind, output_list)

      output$mean.gap.duration[is.infinite(output$mean.gap.duration) |
        is.nan(output$mean.gap.duration)] <- NA
      output$min.gap.duration[is.infinite(output$min.gap.duration) |
        is.nan(output$min.gap.duration)] <- NA

      # order columns
      output <- output[, c(ncol(output), 1:(ncol(output) - 1))]
    }

    # remove duty cycle range when by.sound.file = TRUE
    if (by.sound.file) {
      output$min.duty.cycle <- output$max.duty.cycle <- NULL
      names(output)[names(output) == "mean.duty.cycle"] <- "duty.cycle"
    } else {
      # reformat as a matrix with mean min and max as columns
      other_feats <-
        c(output[, grep("\\.freq$", names(output), invert = TRUE)])

      # order columns
      if (!is.null(output$min.bottom.freq)) {
        freq_feats <- c(output[, grep("\\.freq$", names(output))])
        other_feats <- c(other_feats, freq_feats)
      }
      output <- matrix(unlist(other_feats), ncol = 3, byrow = TRUE)
      colnames(output) <- c("min", "mean", "max")

      row_names <-
        c(
          "sel.duration",
          "gap.duration",
          "annotations",
          "duty.cycle",
          "peak.amplitude",
          "bottom.freq",
          "top.freq"
        )

      if (is.null(path)) {
        row_names <-
          grep("duty.cycle|peak.amplitude",
            row_names,
            value = TRUE,
            invert = TRUE
          )
      }

      rownames(output) <- row_names[seq_len(nrow(output))]
    }

    # round digits and change units
    if (is.matrix(output)) {
      # fix units
      if (units[1] == "ms") {
        for (i in grep("duration$", rownames(output))) {
          output[i, ] <- output[i, ] * 1000
        }
      }

      if (units[2] == "hz") {
        for (i in grep("freq$", rownames(output))) {
          output[i, ] <- output[i, ] * 1000
        }
      }

      # round
      output <- round(x = output, digits = digits)
    } else {
      # fix units
      if (units[1] == "ms") {
        for (i in grep("duration$", colnames(output))) {
          output[, i] <- output[, i] * 1000
        }
      }

      if (units[2] == "hz") {
        for (i in grep("freq$", colnames(output))) {
          output[, i] <- output[, i] * 1000
        }
      }

      for (i in 2:ncol(output)) {
        output[, i] <- round(x = output[, i], digits = digits)
      }
    }
    return(output)
  }



##############################################################################################################
#' alternative name for \code{\link{summarize_reference}}
#'
#' @keywords internal
#' @details see \code{\link{summarize_reference}} for documentation. \code{\link{feature_reference}} will be deprecated in future versions.
#' @export

feature_reference <- summarize_reference
maRce10/ohun documentation built on Oct. 25, 2024, 6:22 p.m.