R/modulationSpectrum.R
In soundgen: Sound Synthesis and Acoustic Analysis

Documented in modulationSpectrum .modulationSpectrum modulationSpectrumFragment

### MODULATION SPECTRUM

#' Modulation spectrum
#'
#' Produces a modulation spectrum of waveform(s) or audio file(s). It begins
#' with some spectrogram-like time-frequency representation and analyzes the
#' modulation of the envelope in each frequency band. if \code{specSource =
#' 'audSpec'}, the sound is passed through a bank of bandpass filters with
#' \code{\link{audSpectrogram}}. If \code{specSource = 'STFT'}, we begin with an
#' ordinary spectrogram produced with a Short-Time Fourier Transform. If
#' \code{msType = '2D'}, the modulation spectrum is a 2D Fourier transform of
#' the spectrogram-like representation, with temporal modulation along the X
#' axis and spectral modulation along the Y axis. A good visual analogy is
#' decomposing the spectrogram into a sum of ripples of various frequencies and
#' directions. If \code{msType = '1D'}, the modulation spectrum is a matrix
#' containing 1D Fourier transforms of each frequency band in the spectrogram,
#' so the result again has modulation frequencies along the X axis, but the Y
#' axis now shows the frequency of each analyzed band. Roughness is calculated
#' as the proportion of the modulation spectrum within \code{roughRange} of
#' temporal modulation frequencies or some weighted version thereof. The
#' frequency of amplitude modulation (amMsFreq, Hz) is calculated as the highest
#' peak in the smoothed AM function, and its purity (amMsPurity, dB) as the
#' ratio of this peak to the median AM over \code{amRange}. For relatively short
#' and steady sounds, set \code{amRes = NULL} and analyze the entire sound. For
#' longer sounds and when roughness or AM vary over time, set \code{amRes} to
#' get multiple measurements over time (see examples). For multiple inputs, such
#' as a list of waveforms or path to a folder with audio files, the ensemble of
#' modulation spectra can be interpolated to the same spectral and temporal
#' resolution and averaged (if \code{averageMS = TRUE}).
#'
#' @seealso \code{\link{plotMS}} \code{\link{spectrogram}}
#'   \code{\link{audSpectrogram}} \code{\link{analyze}}
#'
#' @references \itemize{
#'   \item Singh, N. C., & Theunissen, F. E. (2003). Modulation spectra of
#'   natural sounds and ethological theories of auditory processing. The Journal
#'   of the Acoustical Society of America, 114(6), 3394-3411.
#' }
#' @inheritParams spectrogram
#' @inheritParams analyze
#' @param msType '2D' = two-dimensional Fourier transform of a spectrogram; '1D'
#'   = separately calculated spectrum of each frequency band
#' @param specSource 'STFT' = Short-Time Fourier Transform; 'audSpec' = a bank
#'   of bandpass filters (see \code{\link{audSpectrogram}})
#' @param windowLength,step,wn,zp parameters for extracting a spectrogram if
#'   \code{specType = 'STFT'}. Window length and step are specified in ms (see
#'   \code{\link{spectrogram}}). If \code{specType = 'audSpec'}, these settings
#'   have no effect
#' @param audSpec_pars parameters for extracting an auditory spectrogram if
#'   \code{specType = 'audSpec'}. If \code{specType = 'STFT'}, these settings
#'   have no effect
#' @param roughRange the range of temporal modulation frequencies that
#'   constitute the "roughness" zone, Hz
#' @param roughMean,roughSD the mean (Hz) and standard deviation (semitones) of
#'   a lognormal distribution used to weight roughness estimates. If either is
#'   null, roughness is calculated simply as the proportion of spectrum within
#'   \code{roughRange}. If both \code{roughMean} and \code{roughRange} are
#'   defined, weights outside \code{roughRange} are set to 0; a very large SD (a
#'   flat weighting function) gives the same result as just \code{roughRange}
#'   without any weighting (see examples)
#' @param roughMinFreq frequencies below roughMinFreq (Hz) are ignored when
#'   calculating roughness (ie the estimated roughness increases if we disregard
#'   very low-frequency modulation, which is often strong)
#' @param amRange the range of temporal modulation frequencies that we are
#'   interested in as "amplitude modulation" (AM), Hz
#' @param amRes target resolution of amplitude modulation, Hz. If \code{NULL},
#'   the entire sound is analyzed at once, resulting in a single roughness value
#'   (unless it is longer than \code{maxDur}, in which case it is analyzed in
#'   chunks \code{maxDur} s long). If \code{amRes} is set, roughness is
#'   calculated for windows \code{~1000/amRes} ms long (but at least 3 STFT
#'   frames). \code{amRes} also affects the amount of smoothing when calculating
#'   \code{amMsFreq} and \code{amMsPurity}
#' @param maxDur sounds longer than \code{maxDur} s are split into fragments,
#'   and the modulation spectra of all fragments are averaged
#' @param specMethod the function to call when calculating the spectrum of each
#'   frequency band (only used when \code{msType = '1D'}); 'meanspec' is faster
#'   and less noisy, whereas 'spec' produces higher resolution
#' @param logSpec if TRUE, the spectrogram is log-transformed prior to taking 2D
#'   FFT
#' @param logMPS if TRUE, the modulation spectrum is log-transformed prior to
#'   calculating roughness
#' @param power raise modulation spectrum to this power (eg power = 2 for ^2, or
#'   "power spectrum")
#' @param normalize if TRUE, the modulation spectrum of each analyzed fragment
#'   \code{maxDur} in duration is separately normalized to have max = 1
#' @param returnMS if FALSE, only roughness is returned (much faster). Careful
#'   with exporting the modulation spectra of a lot of sounds at once as this
#'   requires a lot of RAM
#' @param returnComplex if TRUE, returns a complex modulation spectrum (without
#'   normalization and warping)
#' @param averageMS if TRUE, the modulation spectra of all inputs are averaged
#'   into a single output; if FALSE, a separate MS is returned for each input
#' @param plot if TRUE, plots the modulation spectrum of each sound (see
#'   \code{\link{plotMS}})
#' @param savePlots if a valid path is specified, a plot is saved in this folder
#'   (defaults to NA)
#' @param logWarpX,logWarpY numeric vector of length 2: c(sigma, base) of
#'   pseudolog-warping the modulation spectrum, as in function
#'   pseudo_log_trans() from the "scales" package
#' @param quantiles labeled contour values, \% (e.g., "50" marks regions that
#'   contain 50\% of the sum total of the entire modulation spectrum)
#' @param kernelSize the size of Gaussian kernel used for smoothing (1 = no
#'   smoothing)
#' @param kernelSD the SD of Gaussian kernel used for smoothing, relative to its
#' size
#' @param xlab,ylab,main,xlim,ylim graphical parameters
#' @param width,height,units,res parameters passed to
#'   \code{\link[grDevices]{png}} if the plot is saved
#' @param ... other graphical parameters passed on to \code{filled.contour.mod}
#'   and \code{\link[graphics]{contour}} (see \code{\link{spectrogram}})
#' @return Returns a list with the following components:
#' \itemize{
#' \item \code{$original} modulation spectrum prior to blurring and log-warping,
#' but after squaring if \code{power = TRUE}, a matrix of nonnegative values.
#' Colnames are temporal modulation frequencies (Hz). Rownames are spectral
#' modulation frequencies (cycles/kHz) if \code{msType = '2D'} and frequencies
#' of filters or spectrograms bands (kHz) if \code{msType = '1D'}.
#' \item \code{$original_list} a list of modulation spectra for each analyzed
#' fragment (is \code{amRes} is not NULL)
#' \item \code{$processed} modulation spectrum after blurring and log-warping
#' \item \code{$complex} untransformed complex modulation spectrum (returned
#' only if returnComplex = TRUE)
#' \item \code{$roughness} proportion of the modulation spectrum within
#' \code{roughRange} of temporal modulation frequencies or a weighted average
#' thereof if \code{roughMean} and \code{roughSD} are defined, \% - a vector if
#' amRes is numeric and the sound is long enough, otherwise a single number
#' \item \code{$roughness_list} a list containing frequencies, amplitudes, and
#' roughness values for each analyzed frequency band (1D) or frequency
#' modulation band (2D)
#' \item \code{$amMsFreq} frequency of the highest peak, within \code{amRange}, of
#' the folded AM function (average AM across all FM bins for both negative and
#' positive AM frequencies), where a peak is a local maximum over \code{amRes}
#' Hz. Like \code{roughness}, \code{amMsFreq} and \code{amMsPurity} can be single
#' numbers or vectors, depending on whether the sound is analyzed as a whole or
#' in chunks
#' \item \code{$amMsPurity} ratio of the peak at amMsFreq to the median AM over
#' \code{amRange}, dB
#' \item \code{$summary} dataframe with summaries of roughness, amMsFreq, and
#' amMsPurity
#' }
#' @export
#' @examples
#' # White noise
#' ms = modulationSpectrum(rnorm(16000), samplingRate = 16000,
#'   logSpec = FALSE, power = TRUE,
#'   amRes = NULL)  # analyze the entire sound, giving a single roughness value
#' str(ms)
#'
#' # Harmonic sound
#' s = soundgen(pitch = 440, amFreq = 100, amDep = 50)
#' ms = modulationSpectrum(s, samplingRate = 16000, amRes = NULL)
#' ms[c('roughness', 'amMsFreq', 'amMsPurity')]  # a single value for each
#' ms1 = modulationSpectrum(s, samplingRate = 16000, amRes = 5)
#' ms1[c('roughness', 'amMsFreq', 'amMsPurity')]
#' # measured over time (low values of amRes mean more precision, so we analyze
#' # longer segments and get fewer values per sound)
#'
#' # Embellish
#' ms = modulationSpectrum(s, samplingRate = 16000, logMPS = TRUE,
#'   xlab = 'Temporal modulation, Hz', ylab = 'Spectral modulation, 1/kHz',
#'   colorTheme = 'matlab', main = 'Modulation spectrum', lty = 3)
#'
#' # 1D instead of 2D
#' modulationSpectrum(s, 16000, msType = '1D', quantiles = NULL,
#'   colorTheme = 'matlab')
#'
#' \dontrun{
#' # A long sound with varying AM and a bit of chaos at the end
#' s_long = soundgen(sylLen = 3500, pitch = c(250, 320, 280),
#'                   amFreq = c(30, 55), amDep = c(20, 60, 40),
#'                   jitterDep = c(0, 0, 2))
#' playme(s_long)
#' ms = modulationSpectrum(s_long, 16000)
#' # plot AM over time
#' plot(x = seq(1, 1500, length.out = length(ms$amMsFreq)), y = ms$amMsFreq,
#'      cex = 10^(ms$amMsPurity/20) * 10, xlab = 'Time, ms', ylab = 'AM frequency, Hz')
#' # plot roughness over time
#' spectrogram(s_long, 16000, ylim = c(0, 4),
#'   extraContour = list(ms$roughness / max(ms$roughness) * 4000, col = 'blue'))
#'
#' # As with spectrograms, there is a tradeoff in time-frequency resolution
#' s = soundgen(pitch = 500, amFreq = 50, amDep = 100, sylLen = 500,
#'              samplingRate = 44100, plot = TRUE)
#' # playme(s, samplingRate = 44100)
#' ms = modulationSpectrum(s, samplingRate = 44100,
#'   windowLength = 50, step = 50, amRes = NULL)  # poor temporal resolution
#' ms = modulationSpectrum(s, samplingRate = 44100,
#'   windowLength = 5, step = 1, amRes = NULL)  # poor frequency resolution
#' ms = modulationSpectrum(s, samplingRate = 44100,
#'   windowLength = 15, step = 3, amRes = NULL)  # a reasonable compromise
#'
#' # Start with an auditory spectrogram instead of STFT
#' modulationSpectrum(s, 44100, specSource = 'audSpec', xlim = c(-100, 100))
#' modulationSpectrum(s, 44100, specSource = 'audSpec',
#'   logWarpX = c(10, 2), xlim = c(-500, 500),
#'   audSpec_pars = list(nFilters = 32, filterType = 'gammatone', bandwidth = NULL))
#'
#' # customize the plot
#' ms = modulationSpectrum(s, samplingRate = 44100,
#'   windowLength = 15, overlap = 80, amRes = NULL,
#'   kernelSize = 17,  # more smoothing
#'   xlim = c(-70, 70), ylim = c(0, 4),  # zoom in on the central region
#'   quantiles = c(.25, .5, .8),  # customize contour lines
#'   col = rev(rainbow(100)),  # alternative palette
#'   logWarpX = c(10, 2),  # pseudo-log transform
#'   power = 2)                   # ^2
#' # Note the peaks at FM = 2/kHz (from "pitch = 500") and AM = 50 Hz (from
#' # "amFreq = 50")
#'
#' # Input can be a wav/mp3 file
#' ms = modulationSpectrum('~/Downloads/temp/16002_Faking_It_Large_clear.wav')
#'
#' # Input can be path to folder with audio files. Each file is processed
#' # separately, and the output can contain an MS per file...
#' ms1 = modulationSpectrum('~/Downloads/temp', kernelSize = 11,
#'                          plot = FALSE, averageMS = FALSE)
#' ms1$summary
#' names(ms1$original)  # a separate MS per file
#' # ...or a single MS can be calculated:
#' ms2 = modulationSpectrum('~/Downloads/temp', kernelSize = 11,
#'                          plot = FALSE, averageMS = TRUE)
#' plotMS(ms2$original)
#' ms2$summary
#'
#' # Input can also be a list of waveforms (numeric vectors)
#' ss = vector('list', 10)
#' for (i in seq_along(ss)) {
#'   ss[[i]] = soundgen(sylLen = runif(1, 100, 1000), temperature = .4,
#'     pitch = runif(3, 400, 600))
#' }
#' # lapply(ss, playme)
#' # MS of the first sound
#' ms1 = modulationSpectrum(ss[[1]], samplingRate = 16000, scale = 1)
#' # average MS of all 10 sounds
#' ms2 = modulationSpectrum(ss, samplingRate = 16000, scale = 1, averageMS = TRUE, plot = FALSE)
#' plotMS(ms2$original)
#'
#' # A sound with ~3 syllables per second and only downsweeps in F0 contour
#' s = soundgen(nSyl = 8, sylLen = 200, pauseLen = 100, pitch = c(300, 200))
#' # playme(s)
#' ms = modulationSpectrum(s, samplingRate = 16000, maxDur = .5,
#'   xlim = c(-25, 25), colorTheme = 'seewave',
#'   power = 2)
#' # note the asymmetry b/c of downsweeps
#'
#' # "power = 2" returns squared modulation spectrum - note that this affects
#' # the roughness measure!
#' ms$roughness
#' # compare:
#' modulationSpectrum(s, samplingRate = 16000, maxDur = .5,
#'   xlim = c(-25, 25), colorTheme = 'seewave',
#'   power = 1)$roughness  # much higher roughness
#'
#' # Plotting with or without log-warping the modulation spectrum:
#' ms = modulationSpectrum(soundgen(), samplingRate = 16000, plot = TRUE)
#' ms = modulationSpectrum(soundgen(), samplingRate = 16000,
#'   logWarpX = c(2, 2), plot = TRUE)
#'
#' # logWarp and kernelSize have no effect on roughness
#' # because it is calculated before these transforms:
#' modulationSpectrum(s, samplingRate = 16000, logWarpX = c(1, 10))$roughness
#' modulationSpectrum(s, samplingRate = 16000, logWarpX = NA)$roughness
#' modulationSpectrum(s, samplingRate = 16000, kernelSize = 17)$roughness
#'
#' # Log-transform the spectrogram prior to 2D FFT (affects roughness):
#' modulationSpectrum(s, samplingRate = 16000, logSpec = FALSE)$roughness
#' modulationSpectrum(s, samplingRate = 16000, logSpec = TRUE)$roughness
#'
#' # Use a lognormal weighting function to calculate roughness
#' # (instead of just % in roughRange)
#' modulationSpectrum(s, 16000, roughRange = NULL,
#'   roughMean = 75, roughSD = 3)$roughness
#' modulationSpectrum(s, 16000, roughRange = NULL,
#'   roughMean = 100, roughSD = 12)$roughness
#' # truncate weights outside roughRange
#' modulationSpectrum(s, 16000, roughRange = c(30, 150),
#'   roughMean = 100, roughSD = 1000)$roughness  # very large SD
#' modulationSpectrum(s, 16000, roughRange = c(30, 150),
#'   roughMean = NULL)$roughness  # same as above b/c SD --> Inf
#'
#' # Complex modulation spectrum with phase preserved
#' ms = modulationSpectrum(soundgen(), samplingRate = 16000,
#'                         returnComplex = TRUE)
#' plotMS(abs(ms$complex))  # note the symmetry
#' # compare:
#' plotMS(ms$original)
#' }
modulationSpectrum = function(
    x,
    samplingRate = NULL,
    scale = NULL,
    from = NULL,
    to = NULL,
    msType = c('1D', '2D')[2],
    specSource = c('STFT', 'audSpec')[1],
    windowLength = 15,
    step = 1,
    wn = 'hanning',
    zp = 0,
    audSpec_pars = list(filterType = 'butterworth',
                        nFilters = 32,
                        bandwidth = 1/24,
                        yScale = 'bark',
                        dynamicRange = 120),
    amRes = 5,
    maxDur = 5,
    specMethod = c('spec', 'meanspec')[2],
    logSpec = FALSE,
    logMPS = FALSE,
    power = 1,
    normalize = TRUE,
    roughRange = c(30, 150),
    roughMean = NULL,
    roughSD = NULL,
    roughMinFreq = 1,
    amRange = c(10, 200),
    returnMS = TRUE,
    returnComplex = FALSE,
    summaryFun = c('mean', 'median', 'sd'),
    averageMS = FALSE,
    reportEvery = NULL,
    cores = 1,
    plot = TRUE,
    savePlots = NULL,
    logWarpX = NULL,
    logWarpY = NULL,
    quantiles = c(.5, .8, .9),
    kernelSize = 5,
    kernelSD = .5,
    colorTheme = c('bw', 'seewave', 'heat.colors', '...')[1],
    col = NULL,
    main = NULL,
    xlab = 'Hz',
    ylab = NULL,
    xlim = NULL,
    ylim = NULL,
    width = 900,
    height = 500,
    units = 'px',
    res = NA,
    ...
) {
  # default labels for y-axis
  if (is.null(ylab)) {
    if (specSource == 'STFT') {
      ylab = ifelse(msType == '2D', '1/kHz', 'kHz')
    } else {
      if (msType == '2D') {
        ylab = if (is.null(audSpec_pars$yScale)) '' else paste0('1/', audSpec_pars$yScale)
      } else {
        ylab = 'kHz'
      }
    }
  }

  ## Prepare a list of arguments to pass to .modulationSpectrum()
  myPars = c(as.list(environment()), list(...))
  # exclude unnecessary args
  myPars = myPars[!names(myPars) %in% c(
    'x', 'samplingRate', 'scale', 'from', 'to', 'savePlots',
    'reportEvery', 'cores', 'summaryFun', 'averageMS', 'audSpec_pars')]
  myPars$audSpec_pars = audSpec_pars

  # analyze
  pa = processAudio(
    x,
    samplingRate = samplingRate,
    scale = scale,
    from = from,
    to = to,
    funToCall = '.modulationSpectrum',
    myPars = myPars,
    reportEvery = reportEvery,
    cores = cores,
    savePlots = savePlots
  )

  # htmlPlots
  if (!is.null(pa$input$savePlots) && pa$input$n > 1) {
    try(htmlPlots(pa$input, savePlots = savePlots, changesAudio = FALSE,
                  suffix = "MS", width = paste0(width, units)))
  }

  # prepare output
  if (!is.null(summaryFun) && any(!is.na(summaryFun))) {
    temp = vector('list', pa$input$n)
    for (i in 1:pa$input$n) {
      if (!pa$input$failed[i]) {
        temp[[i]] = summarizeAnalyze(
          data.frame(roughness = pa$result[[i]]$roughness,
                     amMsFreq = pa$result[[i]]$amMsFreq,
                     amMsPurity = pa$result[[i]]$amMsPurity),
          summaryFun = summaryFun,
          var_noSummary = NULL)
      }
    }
    idx_failed = which(pa$input$failed)
    idx_ok = which(!pa$input$failed)
    if (length(idx_failed) > 0) {
      idx_ok = which(!pa$input$failed)
      if (length(idx_ok) > 0) {
        filler = temp[[idx_ok[1]]] [1, ]
        filler[1, ] = NA
      } else {
        stop('Failed to analyze any input')
      }
      for (i in idx_failed) temp[[i]] = filler
    }
    mysum_all = cbind(data.frame(file = pa$input$filenames_base),
                      do.call('rbind', temp))
  } else {
    mysum_all = NULL
  }
  for (i in idx_failed) pa$result[[i]] = list(
    original = NULL, processed = NULL, complex = NULL,
    roughness = NULL, amMsFreq = NULL, amMsPurity = NULL
  )
  original = original_list = processed = complex = roughness = roughness_list =
    modulation_spectrogram = roughness_spectrogram =
    amMsFreq = amMsPurity = ampl = NULL
  # (otherwise note about no visible binding)
  out_prep = c('original', 'original_list', 'modulation_spectrogram',
               'processed', 'complex', 'roughness', 'roughness_spectrogram',
               'roughness_list', 'amMsFreq', 'amMsPurity', 'ampl')
  if (pa$input$n == 1) {
    # unlist
    for (op in out_prep) assign(noquote(op), pa$result[[1]] [[op]])
  } else {
    for (op in out_prep) assign(noquote(op), lapply(pa$result, function(x) x[[op]]))
  }

  # average MS across sounds
  op_prep2 = c('original', 'processed')  # roughness is not a matrix
  if (returnComplex) {
    op_prep2 = c(op_prep2, 'complex')
  } else {
    complex = NULL
  }
  if (averageMS & pa$input$n > 1) {
    for (op in op_prep2) {
      assign(
        noquote(op),
        averageMatrices(
          get(op) [idx_ok],
          rFun = 'max',   # normally same samplingRate, but if not, upsample frequency resolution
          cFun = 'median',   # typical ncol (depends on sound dur)
          reduceFun = '+')  # internally divides by length, so actually becomes mean
      )
    }
  }

  invisible(list(original = original,
                 original_list = original_list,
                 modulation_spectrogram = modulation_spectrogram,
                 processed = processed,
                 complex = complex,
                 roughness = roughness,
                 roughness_list = roughness_list,
                 roughness_spectrogram = roughness_spectrogram,
                 amMsFreq = amMsFreq,
                 amMsPurity = amMsPurity,
                 ampl = ampl,
                 summary = mysum_all))
}


#' Modulation spectrum per sound
#'
#' Internal soundgen function.
#' @inheritParams modulationSpectrum
#' @param audio a list returned by \code{readAudio}
#' @keywords internal
.modulationSpectrum = function(
    audio,
    specSource = c('STFT', 'audSpec')[1],
    windowLength = 15,
    step = 1,
    wn = 'hanning',
    zp = 0,
    audSpec_pars = list(filterType = 'butterworth',
                        nFilters = 32,
                        bandwidth = 1/24,
                        yScale = 'bark',
                        dynamicRange = 120),
    msType = c('1D', '2D')[2],
    amRes = 5,
    maxDur = 5,
    specMethod = c('spec', 'meanspec')[2],
    logSpec = FALSE,
    logMPS = FALSE,
    power = 1,
    normalize = TRUE,
    roughRange = c(30, 150),
    roughMean = NULL,
    roughSD = NULL,
    roughMinFreq = 1,
    amRange = c(10, 200),
    returnMS = TRUE,
    returnComplex = FALSE,
    plot = TRUE,
    savePlots = NULL,
    logWarpX = NULL,
    logWarpY = NULL,
    quantiles = c(.5, .8, .9),
    kernelSize = 5,
    kernelSD = .5,
    colorTheme = c('bw', 'seewave', 'heat.colors', '...')[1],
    col = NULL,
    main = NULL,
    xlab = 'Hz',
    ylab = '1/kHz',
    xlim = NULL,
    ylim = NULL,
    width = 900,
    height = 500,
    units = 'px',
    res = NA,
    ...
) {
  # Re-set windowLength, step, and overlap so as to ensure that
  # windowLength_points and step_points are not fractions
  if (is.null(step)) step = windowLength * (1 - overlap / 100)
  step_points = round(step / 1000 * audio$samplingRate)
  step = step_points / audio$samplingRate * 1000
  windowLength_points = round(windowLength / 1000 * audio$samplingRate)
  windowLength = windowLength_points / audio$samplingRate * 1000
  overlap = 100 * (1 - step_points / windowLength_points)
  lowestFreq = if (is.null(roughRange)) 5 else min(5, roughRange[1])

  max_am = 1000 / step / 2
  if (!is.null(roughRange) && max_am < roughRange[1]) {
    warning(paste(
      'roughRange outside the analyzed range of temporal modulation frequencies;',
      'increase overlap / decrease step to improve temporal resolution,',
      'or else look for roughness in a lower range'))
  }
  if (is.numeric(amRes)) {
    if (specSource == 'STFT') {
      nFrames = max(3, ceiling(max_am / amRes * 2))  # min 3 spectrograms frames
      # split the input sound into chunks nFrames long
      chunk_ms = windowLength + step * (nFrames - 1)
      if (amRes < (1/audio$duration)) {
        message(paste('The sound is too short to be analyzed with amRes =', amRes,
                      'Hz. Actual amRes ~= ', round(1/audio$duration, 2)))
        splitInto = 1
      } else {
        splitInto = max(1, ceiling(audio$duration * 1000 / chunk_ms))
      }
    } else {
      # split into as many chunks as needed (each chunk's dur = 1/amRes s)
      splitInto = max(1, round(audio$duration * amRes))
    }
  } else {
    # split only those sounds that exceed maxDur
    splitInto = max(1, ceiling(audio$duration / maxDur))
    # so, for ex., if 2.1 times longer than maxDur, split into three
  }

  if (splitInto > 1) {
    myseq = floor(seq(1, audio$ls, length.out = splitInto + 1))
    midpoints = myseq[1:splitInto] + (myseq[2] - myseq[1]) / 2
    myInput = vector('list', splitInto)
    for (i in 1:splitInto) {
      idx = myseq[i]:(myseq[i + 1])
      myInput[[i]] = audio$sound[idx]
    }
  } else {
    myInput = list(audio$sound)
  }

  # extract modulation spectrum per fragment
  ampl = rep(NA, splitInto)
  out = vector('list', splitInto)
  if (splitInto > 1) names(out) = midpoints / audio$ls * audio$dur
  if (returnComplex) {
    out_complex = out
  } else {
    out_aggreg_complex = NULL
  }
  for (i in 1:splitInto) {
    ms_i = modulationSpectrumFragment(
      myInput[[i]],
      samplingRate = audio$samplingRate,
      specSource = specSource,
      audSpec_pars = audSpec_pars,
      msType = msType,
      windowLength = windowLength,
      windowLength_points = windowLength_points,
      step = step,
      step_points = step_points,
      lowestFreq = lowestFreq,
      wn = wn,
      zp = zp,
      specMethod = specMethod,
      logSpec = logSpec,
      logMPS = logMPS,
      power = power,
      normalize = normalize)
    out[[i]] = ms_i$ms_half
    ampl[i] = ms_i$ampl

    if (returnComplex) {
      out_complex[[i]] = ms_i$ms_complex
    }
  }
  keep = which(unlist(lapply(out, function(x) !is.null(x))))
  out = out[keep]
  if (length(out) < 1) {
    warning('The sound is too short or windowLength too long. Need at least 3 frames')
    return(list('original' = NA,
                'original_list' = NA,
                'modulation_spectrogram' = NA,
                'processed' = NA,
                'complex' = NA,
                'roughness' = NA,
                'roughness_list' = NA,
                'roughness_spectrogram' = NA,
                'amMsFreq' = NA,
                'amMsPurity' = NA,
                'ampl' = NA))
  }

  # standardize the size of matrices for ease of processing
  if (length(out) > 1) {
    ncs = unlist(lapply(out, ncol))
    nc_max = max(ncs)
    for (i in seq_along(out)) {
      if (ncol(out[[i]]) != nc_max)
        out[[i]] = interpolMatrix(out[[i]], nc = nc_max)
    }
  }

  # concatenate freq-averaged modulation spectra into a modulation spectrogram
  if (length(out) < 2) {
    mg = NA
  } else {
    mg = do.call(cbind, lapply(out, colSums))
    colnames(mg) = as.numeric(names(out)) * 1000 # time in ms
    rownames(mg) = as.numeric(colnames(out[[1]])) / 1000 # modulation frequency in kHz
  }

  # extract a measure of roughness
  drop_any = !is.null(roughMinFreq) && is.finite(roughMinFreq) && roughMinFreq > 0
  if (drop_any) {
    am_drop = which(abs(as.numeric(colnames(out[[1]]))) < roughMinFreq)
    if (!length(am_drop) > 0) drop_any = FALSE
  }
  if (drop_any) {
    roughness_list = lapply(out, function(x)
      getRough(x[, -am_drop, drop = FALSE], roughRange = roughRange,
               roughMean = roughMean, roughSD = roughSD))
  } else {
    roughness_list = lapply(out, function(x)
      getRough(x, roughRange = roughRange,
               roughMean = roughMean, roughSD = roughSD))
  }
  roughness = unlist(lapply(roughness_list, function(x) sum(x$roughness, na.rm = TRUE)))

  # concatenate roughness per frequency bin over time into a roughness spectrogram
  rg = do.call(cbind, lapply(roughness_list, function(x) x$roughness))
  colnames(rg) = as.numeric(names(out)) * 1000  # time in ms
  rownames(rg) = rownames(out[[1]])  # frequency in kHz

  # detect systematic amplitude modulation at the same frequency
  am_list = lapply(out, function(x)
    getAM(x, amRange = amRange, amRes = amRes))
  amMsFreq = unlist(lapply(am_list, function(x) x$amMsFreq))
  amMsPurity = unlist(lapply(am_list, function(x) x$amMsPurity))

  # average modulation spectra across all sounds
  if (!returnMS) {
    result = list('original' = NULL,
                  'original_list' = NULL,
                  'modulation_spectrogram' = NULL,
                  'processed' = NULL,
                  'complex' = NULL,
                  'roughness' = roughness,
                  'roughness_list' = roughness_list,
                  'roughness_spectrogram' = NULL,
                  'amMsFreq' = amMsFreq,
                  'amMsPurity' = amMsPurity,
                  'ampl' = ampl / audio$scale)
  } else {
    out_aggreg = averageMatrices(
      out,
      rFun = 'max',   # normally same samplingRate, but if not, upsample frequency resolution
      cFun = 'median',  # typical ncol (depends on sound dur)
      reduceFun = '+')  # internally divides by length, so actually becomes mean
    X = as.numeric(colnames(out_aggreg))
    Y = as.numeric(rownames(out_aggreg))
    # image(X, Y, t(log(out_aggreg)))

    # prepare a separate summary of the complex ms
    if (returnComplex) {
      out_aggreg_complex = averageMatrices(
        out_complex,
        rFun = 'max',
        cFun = 'median',
        reduceFun = '+')
      # image(t(log(abs(out_aggreg_complex))))
    }

    # smoothing / blurring
    out_transf = gaussianSmooth2D(out_aggreg,
                                  kernelSize = kernelSize,
                                  kernelSD = kernelSD)
    result = list('original' = out_aggreg,
                  'original_list' = out,
                  'modulation_spectrogram' = mg,
                  'processed' = out_transf,
                  'complex' = out_aggreg_complex,
                  'roughness' = roughness,
                  'roughness_list' = roughness_list,
                  'roughness_spectrogram' = rg,
                  'amMsFreq' = amMsFreq,
                  'amMsPurity' = amMsPurity,
                  'ampl' = ampl / audio$scale)
  }  # end of if (returnMS)


  # PLOTTING
  if (is.character(audio$savePlots)) {
    plot = TRUE
    png(filename = paste0(audio$savePlots, audio$filename_noExt, "_MS.png"),
        width = width, height = height, units = units, res = res)
  }

  if (plot) {
    if (nrow(out_transf) > 1) {
      plotMS(ms = out_transf,
             X = X,
             Y = Y,
             audio = audio,
             colorTheme = colorTheme,
             col = col,
             logWarpX = logWarpX,
             logWarpY = logWarpY,
             xlab = xlab, ylab = ylab,
             main = main,
             xlim = xlim, ylim = ylim,
             quantiles = quantiles,
             extraY = (msType == '2D' && specSource == 'STFT'),
             ...)
    } else {
      plot(X, out_transf, type = 'l', xlab = xlab, ylab = ylab,
           main = main, xlim = xlim, ylim = ylim, ...)
    }
    if (is.character(audio$savePlots)) dev.off()
  }

  invisible(result)
}


#' Modulation spectrum per fragment
#'
#' Internal soundgen function.
#' @inheritParams modulationSpectrum
#' @param sound numeric vector
#' @keywords internal
#' @examples
#' s = soundgen(amFreq = 25, amDep = 100)
#' ms = soundgen:::modulationSpectrumFragment(s, 16000,
#'   windowLength = 50, windowLength_points = .05 * 16000,
#'   step = 5, step_points = .005 * 16000)
#' plotMS(ms$ms_half)
#' image(as.numeric(colnames(ms$ms_half)), as.numeric(rownames(ms$ms_half)),
#'       t(log(ms$ms_half)))
modulationSpectrumFragment = function(sound,
                                      samplingRate,
                                      specSource = 'STFT',
                                      audSpec_pars = NULL,
                                      msType = c('2D', '1D')[1],
                                      windowLength,
                                      windowLength_points,
                                      step,
                                      step_points,
                                      lowestFreq,
                                      wn = 'hanning',
                                      zp = 0,
                                      specMethod = c('spec', 'meanspec')[2],
                                      logSpec = FALSE,
                                      logMPS = FALSE,
                                      power = 1,
                                      normalize = TRUE) {
  if (specSource == 'audSpec') {
    if (!is.null(audSpec_pars$nFilters) && audSpec_pars$nFilters == 1) {
      # just take Hilbert envelope of the original sound
      s1 = matrix(Mod(seewave::hilbert(
        sound,
        f = samplingRate,  # not actually needed
        fftw = FALSE)), nrow = 1)
      msType = '1D'  # can't do 2D with a single filter
    } else {
      audSpec = do.call(audSpectrogram, c(audSpec_pars, list(
        x = sound, samplingRate = samplingRate, plot = FALSE)))
      s1 = t(do.call(cbind, audSpec$filterbank_env))
      rownames(s1) = rownames(audSpec$audSpec)
    }
  } else if (specSource == 'STFT') {
    # Calling stdft is ~80 times faster than going through spectrogram (!)
    step_seq = seq(1, length(sound) + 1 - windowLength_points, step_points)
    # print(length(step_seq))
    if (length(step_seq) < 3) return(NULL)
    s1 = seewave::stdft(
      wave = as.matrix(sound),
      wn = wn,
      wl = windowLength_points,
      f = samplingRate,
      zp = zp,
      step = step_seq,
      scale = FALSE,
      norm = FALSE,
      complex = FALSE
    )  # rows = freqs, cols = time
    rownames(s1) = seq(0, samplingRate / 2, length.out = nrow(s1))
  }
  # image(t(log(s1))); s1[1:3, 1:3]; dim(s1); range(s1)

  # log-transform amplitudes
  if (logSpec) {
    positives = which(s1 > 0)
    nonpositives = which(s1 <= 0)
    s1[positives] = log(s1[positives])
    if (length(positives) > 0 & length(nonpositives) > 0) {
      s1[nonpositives] = min(s1[positives])
    }
    s1 = s1 - min(s1) + 1e-16  # positive
    # image(t(s1))
  }

  # spectrogram to modulation spectrum
  if (msType == '2D') {
    # 2D FFT
    if (specSource == 'STFT') {
      ms_complex = specToMS(s1, windowLength = windowLength, step = step)
    } else {
      # audSpec - need to set frequency labels
      ms_complex = suppressMessages(
        specToMS(s1, windowLength = NULL, step = 1000/samplingRate))
      rownames(ms_complex) = seq(-1, 1, length.out = nrow(ms_complex))
    }
    if (logMPS) {
      ms = log(abs(ms_complex))
    } else {
      ms = abs(ms_complex)
    }
    symAxis = floor(nrow(ms) / 2) + 1
    # ms[(symAxis - 2) : (symAxis + 2), 1:2]
    ms_half = ms[symAxis:nrow(ms),, drop = FALSE]  # take only the upper half (always even)
  } else if (msType == '1D') {
    # 1D FFT
    sr_s1 = ifelse(specSource == 'STFT', 1000/step, samplingRate)
    ms_complex = NA
    ms_half = specToMS_1D(s1, windowLength = 1000 / lowestFreq,
                          samplingRate = sr_s1, method = specMethod)
    if (logMPS) ms_half = log(ms_half)
  }

  # power
  if (is.numeric(power) && power != 1) ms_half = ms_half ^ power

  # normalize
  if (normalize && diff(range(ms_half)) != 0) {
    ms_half = ms_half - min(ms_half)  # can be negative if logMPS = TRUE
    ms_half = ms_half / max(ms_half) # or sum(ms_half)
  }
  # image(x = as.numeric(colnames(ms_half)), z = t(log(ms_half)))
  # plotMS(log(ms_half + 1e-6), quantiles = NULL, colorTheme = 'matlab', logWarpX = c(10, 2))
  list(
    ms_half = ms_half,
    ms_complex = ms_complex,
    ampl = sqrt(mean(sound^2))
  )
}