# R/plot_audio_summary.R

# Defines functions plot_audio_summary

# Documented in plot_audio_summary

# plot_audio_summary =================================================================

#' @name plot_audio_summary
#' @title Plot barcharts of daily and hourly total recordings across a park bioacoustics dataset.
#' @description Exploratory summary function that creates ggplots of total number of hours of audio collected by date, and total number of hours of audio collected by hour, across a yearly season for 1 or more locations in a park bioacoustic monitoring project. This function helps provide overviews to the user of when recordings were taken, and where gaps in the dataset may have occurred. Summaries may be generated either from a directory that points to raw audio files, or from an acoustic index file like that generated by \code{\link{nvspl_to_ai}}. Using the acoustic.index.fp is desired over using the audio.directory path, particularly for projects that have spanned many years and may have used varied audio recorder parameter settings. See Details.
#' @param audio.directory OPTIONAL. Path to audio files to be processed. Files are expected to have the naming convention SITEID_YYYYMMDD_HHMMSS. User must invoke either the audio.directory OR the acoustic.index.fp argument.
#' @param acoustic.index.fp OPTIONAL. Path to an acoustic index file like that generated by \code{\link{nvspl_to_ai}}. User must invoke either the audio.directory OR the acoustic.index.fp argument.
#' @param max.x Maximum number of labels to include on the x axis of the date plot. Default = 20. Adjust as needed based on output.
#' @param nrow If plotting data that includes multiple locations and/or years, specify the number of rows the plot should have. Adjust as needed based on output. If neither nrow nor ncol is supplied, the plots use one row per location.
#' @param ncol If plotting data that includes multiple locations and/or years, specify the number of columns the plot should have. Adjust as needed based on output. If neither nrow nor ncol is supplied, the plots use one column per year.
#' @param tz.recorder If using audio.directory argument, specify the timezone setting used in the audio recorder using an \href{https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List}{Olson-names-formatted character timezone} for the location (e.g., 'UTC', 'America/Los_Angeles').
#' @param tz.local Specify an \href{https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List}{Olson-names-formatted character timezone} for the monitoring location.
#' @param sample.rate.hz If using audio.directory argument, specify the sample rate in Hz used by the audio recorder for this dataset. Default = 44100.
#' @param bit.depth If using audio.directory argument, specify the bit depth used by the audio recorder for this dataset. Default = 16.
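#' @param bit.rate If using audio.directory argument, specify the bit rate used by the audio recorder for this dataset. Default = 1024. Note that the duration estimate itself is computed from file size, sample.rate.hz, bit.depth, and channels; bit.rate is stored alongside these values but does not enter the calculation.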
#' @param channels If using audio.directory argument, specify the number of channels used by the audio recorder for this dataset. Default = 2.
#' @return Length 2 list of ggplot2 objects named plot.dates and plot.hours.
#' @details
#' This function was developed by the National Park Service Natural Sounds and Night Skies Division. It is intended to provide exploratory plotting that summarizes the total number of hours of audio collected by date and by hour of day across a season in a park bioacoustics project.
#'
#' Using the acoustic.index.fp is desired over the audio.directory path. If computing audio hour totals directly from the audio.directory argument, totals are estimated based on the sample.rate.hz, bit.rate, bit.depth, and channels provided by the user or in the defaults for these arguments; thus, if an audio directory contains data with varying sample rates used over the years, the estimate provided from the audio directory may not be accurate. The function does NOT read every audio file to compute total values because this would be computationally prohibitive; instead, if using audio.directory, it assumes the user has done their due diligence inputting values for sample.rate.hz, bit.rate, bit.depth, and channels. If recordings occur over multiple hours and audio.directory is used, the plot.hours graphic will only display that data were collected over the hour specified in the file name, which may not give a correct estimate.
#'
#' @seealso  \code{\link{wave_to_nvspl}}, \code{\link{nvspl_to_ai}}
#' @import data.table ggplot2 lubridate
#' @importFrom ggplot2 ggplot
#' @importFrom lubridate year month day hour
#' @export
#' @examples
#' \dontrun{
#'
#'
#' ############################################
#' #
#' #  Example using acoustic.index.fp argument
#' #
#' ############################################
#'
#' # Read in example acoustic index data
#' data(exampleAI)
#'
#' # Write sample data to working directory
#' write.csv(x = exampleAI, file = 'ai.csv')
#'
#' gg <- plot_audio_summary(
#'  acoustic.index.fp = 'ai.csv',
#'  tz.local = 'America/Anchorage',
#'  max.x = 10,
#'  nrow = 4,
#'  ncol = 3
#'  )
#'
#' # View plots
#' gg$plot.dates
#' gg$plot.hours
#'
#' # Delete all temporary example files when finished
#' unlink(x = 'ai.csv')
#'
#' ############################################
#' #
#' #  Example using audio.directory argument
#' #
#' ############################################
#'
#'
#' # Create an input directory for this example
#' dir.create('example-input-directory')
#'
#' # Read in example wave files
#' data(exampleAudio1)
#' data(exampleAudio2)
#'
#' # Write example waves to example input directory
#' tuneR::writeWave(object = exampleAudio1,
#'                  filename = 'example-input-directory/Rivendell_20210623_113602.wav')
#' tuneR::writeWave(object = exampleAudio2,
#'                  filename = 'example-input-directory/Rivendell_20210623_114602.wav')
#'
#' gg <- plot_audio_summary(
#'  audio.directory = 'example-input-directory',
#'  tz.recorder = 'UTC',
#'  tz.local = 'America/Los_Angeles',
#'  sample.rate.hz = 22050,
#'  bit.depth = 16,
#'  bit.rate = 1024,
#'  channels = 2
#' )
#'
#' # View plots
#' gg$plot.dates
#' gg$plot.hours
#'
#' # Delete all temporary example files when finished
#' unlink(x = 'example-input-directory', recursive = TRUE)
#'
#' }

# FUTURE WORK: add option to compute from NVSPLs

plot_audio_summary <- function(audio.directory,
                               acoustic.index.fp,
                               max.x = 20, # number of labels to use on the x axis, date
                               nrow,
                               ncol,
                               tz.recorder,
                               tz.local,
                               sample.rate.hz = 44100,
                               bit.depth = 16,
                               bit.rate = 1024,
                               channels = 2) {

  # Build two faceted bar charts (one facet per locationID x year):
  #   plot.dates: total hours recorded per day of year
  #   plot.hours: total hours recorded per hour of day
  # Totals come either from an acoustic index csv (acoustic.index.fp), using
  # its SampleLength_sec column, or from the sizes of the wave files found
  # under audio.directory. Exactly one of the two sources must be supplied.
  # Returns list(plot.dates = <ggplot>, plot.hours = <ggplot>).

  use.audio <- !missing(audio.directory)
  use.ai <- !missing(acoustic.index.fp)

  if (use.audio && use.ai) {
    stop('You have input both an audio.directory and the file path to an acoustic index csv (acoustic.index.fp). Please choose one or the other from which to compute the data summary. See ?plot_audio_summary for details.')
  }

  if (!use.audio && !use.ai) {
    stop('To use this function, you need to either input an audio.directory OR the file path to an acoustic index csv (acoustic.index.fp). Please choose one or the other from which to compute the data summary. See ?plot_audio_summary for details.')
  }

  # max.x divides the date span below, so it has to be a usable positive number
  if (!is.numeric(max.x) || length(max.x) != 1 || is.na(max.x) || max.x <= 0) {
    stop('max.x must be a single positive number giving the maximum number of labels on the x axis of the date plot. See ?plot_audio_summary for details.')
  }

  # If using acoustic index file path to compute summary, do this:
  if (use.ai) {

    if (missing(tz.local)) {
      stop('If using acoustic.index.fp, please use tz.local argument to declare the local timezone at the monitoring site you are evaluating. See ?plot_audio_summary for details.')
    }

    ai <- fread(file = acoustic.index.fp)
    ai[, locationID := Site]
    rec.summary <- ai[, c('locationID', 'Date', 'Hr', 'Min', 'Sec',
                          'timestep', 'SampleLength_sec', 'timezone')]

    # NROW() rather than nrow(): `nrow` is also an argument of this function,
    # and calling nrow() while that argument is missing raises an error
    if (NROW(rec.summary) == 0) {
      stop('The acoustic index file contains no rows to summarize: ', acoustic.index.fp)
    }

    unique.timezones <- unique(rec.summary$timezone)

    # Timestamps must be parsed in the timezone each row was recorded in, so
    # handle one recorder timezone at a time
    for (i in seq_along(unique.timezones)) {
      rec.summary[timezone == unique.timezones[i],
                  dateTimeRecorder := ymd_hms(
                    paste0(Date, ' ', Hr, ':', Min, ':', Sec),
                    tz = unique.timezones[i])]

      # Create a UTC time column for easier conversion
      rec.summary[timezone == unique.timezones[i],
                  dateTimeUTC := with_tz(dateTimeRecorder, tzone = 'UTC')]

      # Create a local time column for downstream interpretability
      rec.summary[timezone == unique.timezones[i],
                  dateTimeLocal := with_tz(dateTimeUTC, tzone = tz.local)]
    }
    rec.summary[, dateTimeRecorder := NULL]

    # as.Date() on POSIXct defaults to tz = 'UTC'; pass tz.local so the date
    # agrees with the local-time year and hour.of.day columns
    rec.summary[, year := year(dateTimeLocal)][
      , hour.of.day := hour(dateTimeLocal)][
        , date := as.Date(dateTimeLocal, tz = tz.local)]

    # Compute total hours sampled by hour of day (per location and year)
    hrs <- rec.summary[, sum(SampleLength_sec),
                       by = c('locationID', 'year', 'hour.of.day')][
                         , Total.Hours := V1 / 3600]

    # Compute total hours recorded by date (per location and year)
    dates <- rec.summary[, sum(SampleLength_sec),
                         by = c('locationID', 'year', 'date')][
                           , Total.Hours := V1 / 3600]

  } # end if acoustic.index.fp


  # If using raw acoustic audio file path to compute summary, do this:
  if (use.audio) {

    if (missing(tz.recorder) || missing(tz.local)) {
      stop('If using audio.directory, please use the tz.recorder and tz.local arguments to declare both the timezone setting used by the audio recorder (e.g., the timezone that would be reflected by the timestamp in the file names) and the local timezone used at the recording site. For example, this could look something like: tz.recorder = "UTC" and tz.local = "America/Los_Angeles" if the timezone used in the recorder settings was UTC, but the local timezone was in Pacific time. See ?plot_audio_summary for details.')
    }

    # Ensure forward slash at end ($) of audio directory
    if (!grepl("\\/$", audio.directory)) {
      audio.directory <- paste0(audio.directory, '/')
    }

    # Anchor and escape the extension so only files that really end in .wav
    # (any letter case) are picked up
    full.paths <- list.files(path = audio.directory,
                             recursive = TRUE,
                             pattern = '\\.wav$',
                             ignore.case = TRUE,
                             full.names = TRUE)

    if (length(full.paths) == 0) {
      stop('No .wav files were found in audio.directory: ', audio.directory,
           ' Please check the path. See ?plot_audio_summary for details.')
    }

    # Query all file sizes, summarize alongside file names & save results
    fsize <- file.size(full.paths)
    summary.info <- data.table(path = full.paths)
    summary.info[, recordingID := basename(path)][, file.size := fsize]
    summary.info[, file.size.mb := file.size * 0.000001]

    # add_time_cols() is defined elsewhere in the package. The code below
    # relies on it returning locationID and dateTimeLocal columns; it is
    # presumably parsing them from recordingID -- confirm against its docs.
    rec.sum <- add_time_cols(summary.info,
                             tz.recorder = tz.recorder,
                             tz.local = tz.local)

    # Pass tz.local so the calendar date is taken in local time rather than
    # as.Date()'s POSIXct default of UTC
    rec.sum[, date := as.Date(dateTimeLocal, tz = tz.local)][
      , year := year(dateTimeLocal)][
        , hour.of.day := hour(dateTimeLocal)]

    # Record the recorder settings used for the estimate alongside the data
    rec.sum[, sample.rate.hz := sample.rate.hz][
      , bit.depth := bit.depth][
        , bit.rate := bit.rate][
          , channels := channels]

    # Approximate duration: bytes / (bytes per second of audio) / 60.
    # bit.rate is not part of this calculation.
    rec.sum[, dur.mins := round(
      file.size / (sample.rate.hz * bit.depth * channels / 8) / 60, 2)]

    # Compute total hours sampled by hour of day
    hrs <- rec.sum[, sum(dur.mins), by = c('locationID', 'year', 'hour.of.day')][
      , Total.Hours := V1 / 60]

    # Compute total hours recorded by day of year
    dates <- rec.sum[, sum(dur.mins), by = c('locationID', 'year', 'date')][
      , Total.Hours := V1 / 60]

  } # end if using audio.directory

  # Prep to create human readable date labels on x axis
  dates[, julian.date := yday(date)][
    , month := lubridate::month(date, label = TRUE)][
      , day := day(date)]
  dates[, date.lab := paste0(day, '-', month)]
  date.range <- range(dates$julian.date)

  # Keep the step at 1 or more: a span shorter than ~max.x/2 days would
  # otherwise round to by = 0, which seq() rejects
  label.step <- max(1, round(diff(date.range) / max.x, 0))
  jul.dates <- seq(from = date.range[1], to = date.range[2], by = label.step)
  brks <- unique(dates[julian.date %in% jul.dates,
                       c('julian.date', 'date.lab')])
  setkey(brks, julian.date)
  # The same day of year can map to two labels across leap and non-leap
  # years; keep one label per break position
  brks <- unique(brks, by = 'julian.date')

  # Facet layout: one row per location and one column per year when the user
  # gives neither; when only one is given, let facet_wrap() derive the other
  if (missing(nrow) && missing(ncol)) {
    nrow <- length(unique(dates$locationID))
    ncol <- length(unique(dates$year))
  } else if (missing(nrow)) {
    nrow <- NULL
  } else if (missing(ncol)) {
    ncol <- NULL
  }

  # Plot number of hours by date
  plot.dates <- ggplot(dates, aes(julian.date, Total.Hours)) +
    geom_bar(stat = 'identity', width = 1) +
    facet_wrap(locationID ~ year, nrow = nrow, ncol = ncol) +
    xlab('Date') +
    ylab('Total Number of Hours Recorded') +
    ggtitle('Distribution of Hours Recorded Across the Season') +
    scale_x_continuous(breaks = brks$julian.date, labels = brks$date.lab) +
    scale_y_continuous(expand = c(0, 0)) +
    theme(axis.ticks.y = element_blank(),
          #  text = element_text(family = 'Tahoma'),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(color = 'black'),
          plot.title = element_text(hjust = 0.5, size = 12),
          legend.text = element_text(size = 12),
          legend.title = element_blank(),
          strip.background = element_blank(),
          panel.border = element_rect(color = 'black', fill = NA),
          axis.title.x = element_text(size = 12),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 10))

  # Plot number of hours by hour of day (local time)
  plot.hours <- ggplot(hrs, aes(factor(hour.of.day), Total.Hours)) +
    geom_bar(stat = 'identity', width = 1) +
    facet_wrap(locationID ~ year, nrow = nrow, ncol = ncol) +
    xlab('Hour of Day') + # in local time
    ylab('Total Number of Hours Recorded') +
    ggtitle('Distribution of Hours Recorded Across the Day') +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_discrete() +
    theme(axis.ticks.y = element_blank(),
          axis.line = element_line(color = 'black'),
          #  text = element_text(family = 'Tahoma'),
          plot.title = element_text(hjust = 0.5, size = 12),
          legend.text = element_text(size = 12),
          legend.title = element_blank(),
          strip.background = element_blank(),
          panel.border = element_rect(color = 'black', fill = NA),
          axis.title.x = element_text(size = 12))

  plots <- list(plot.dates = plot.dates, plot.hours = plot.hours)

  message('If you are having trouble outputting the plots, make sure to store this output to an object (e.g., g <- plot_audio_summary(your inputs here). Then you can access the two plots via g$plot.dates and g$plot.hours. See ?plot_audio_summary for examples.')

  return(plots)

}
# nationalparkservice/NSNSDAcoustics documentation built on March 4, 2025, 10:24 p.m.