#' Spectral envelope
#'
#' Prepares a spectral envelope for filtering a sound to add formants, lip
#' radiation, and some stochastic component regulated by temperature. Formants
#' are specified as a list containing time, frequency, amplitude, and width
#' values for each formant (see examples). See vignette('sound_generation',
#' package = 'soundgen') for more information.
#' @param nr the number of frequency bins = windowLength_points/2, where
#' windowLength_points is the size of the window for the Fourier transform
#' @param nc the number of time steps for the Fourier transform
#' @inheritParams soundgen
#' @inheritParams getSmoothContour
#' @param formants a character string like "aaui" referring to default presets
#' for speaker "M1"; a vector of formant frequencies; or a list of formant
#' times, frequencies, amplitudes, and bandwidths, with a single value of each
#' for static or multiple values of each for moving formants. \code{formants =
#' NA} defaults to schwa. Time stamps for formants and mouthOpening can be
#' specified in ms or on any other arbitrary scale.
#' @param formDrift scale factor regulating the effect of temperature on the
#' depth of random drift of all formants (user-defined and stochastic): the
#' higher, the more formants drift at a given temperature
#' @param formDisp scale factor regulating the effect of temperature on the
#' irregularity of the dispersion of stochastic formants: the higher, the more
#' unevenly stochastic formants are spaced at a given temperature
#' @param speedSound speed of sound in warm air, cm/s. Stevens (2000) "Acoustic
#' phonetics", p. 138
#' @param openMouthBoost amplify the voice when the mouth is open by
#' \code{openMouthBoost} dB
#' @param formantDepStoch multiplication factor for the amplitude of additional
#' formants added above the highest specified formant (0 = none, 1 = default)
#' @param smoothLinearFactor regulates smoothing of formant anchors (0 to +Inf)
#' as they are upsampled to the number of fft steps \code{nc}. This is
#' necessary because the input \code{formants} normally contains fewer
#' sets of formant values than the number of fft steps.
#' \code{smoothLinearFactor} = 0: close to default spline; >3: approaches
#' linear extrapolation
#' @param output "simple" returns just the spectral filter, while "detailed"
#' also returns a data.frame of formant frequencies over time (needed for
#' internal purposes such as formant locking)
#' @param plot if TRUE, produces a plot of the spectral envelope
#' @param duration duration of the sound, ms (for plotting purposes only)
#' @param colorTheme black and white ('bw'), as in seewave package ('seewave'),
#' or another color theme (e.g. 'heat.colors')
#' @param col actual colors, e.g. rev(rainbow(100)) - see ?hcl.colors for colors
#' in base R (overrides colorTheme)
#' @param xlab,ylab labels of axes
#' @param ... other graphical parameters passed on to \code{image()}
#' @export
#' @return Returns a spectral filter: a matrix with frequency bins in rows and
#' time steps in columns. Accordingly, rownames of the output give central
#' frequency of each bin (in kHz), while colnames give time stamps (in ms if
#' duration is specified, otherwise 0 to 1).
#' @examples
#' # [a] with only F1-F3 visible, with no stochasticity
#' e = getSpectralEnvelope(nr = 512, nc = 50, duration = 300,
#' formants = soundgen:::convertStringToFormants('a'),
#' temperature = 0, plot = TRUE, col = heat.colors(150))
#' # image(t(e)) # to plot the output on a linear scale instead of dB
#'
#' # some "wiggling" of specified formants plus extra formants on top
#' e = getSpectralEnvelope(nr = 512, nc = 50,
#' formants = c(860, 1430, 2900),
#' temperature = 0.1, formantDepStoch = 1, plot = TRUE)
#'
#' # a schwa based on variable length of vocal tract
#' e = getSpectralEnvelope(nr = 512, nc = 100, formants = NA,
#' vocalTract = list(time = c(0, .4, 1), value = c(13, 18, 17)),
#' temperature = .1, plot = TRUE)
#'
#' # no formants at all, only lip radiation
#' e = getSpectralEnvelope(nr = 512, nc = 50, lipRad = 6,
#' formants = NA, temperature = 0, plot = FALSE)
#' plot(e[, 1], type = 'l') # linear scale
#' plot(20 * log10(e[, 1]), type = 'l') # dB scale - 6 dB/oct
#'
#' # mouth opening
#' e = getSpectralEnvelope(nr = 512, nc = 50,
#' vocalTract = 16, plot = TRUE, lipRad = 6, noseRad = 4,
#' mouth = data.frame(time = c(0, .5, 1), value = c(0, 0, .5)))
#'
#' # scale formant amplitude and/or bandwidth
#' e1 = getSpectralEnvelope(nr = 512, nc = 50,
#' formants = soundgen:::convertStringToFormants('a'),
#' formantWidth = 1, formantDep = 1) # defaults
#' e2 = getSpectralEnvelope(nr = 512, nc = 50,
#' formants = soundgen:::convertStringToFormants('a'),
#' formantWidth = 1.5, formantDep = 1.5)
#' plot(as.numeric(rownames(e2)), 20 * log10(e2[, 1]),
#' type = 'l', xlab = 'kHz', ylab = 'dB', col = 'red', lty = 2)
#' points(as.numeric(rownames(e1)), 20 * log10(e1[, 1]), type = 'l')
#'
#' # manual specification of formants
#' e3 = getSpectralEnvelope(
#' nr = 512, nc = 50, samplingRate = 16000, plot = TRUE,
#' formants = list(
#' f1 = list(freq = c(900, 500), amp = c(30, 35), width = c(80, 50)),
#' f2 = list(freq = c(1900, 2500), amp = c(25, 30), width = 100),
#' f3 = list(freq = 3400, amp = 30, width = 120)
#' ))
#'
#' # extra zero-pole pair (doesn't affect estimated VTL and thus the extra
#' # formants added on top)
#' e4 = getSpectralEnvelope(
#' nr = 512, nc = 50, samplingRate = 16000, plot = TRUE,
#' formants = list(
#' f1 = list(freq = c(900, 500), amp = c(30, 35), width = c(80, 50)),
#' f1.5 = list(freq = 1300, amp = -15),
#' f1.7 = list(freq = 1500, amp = 15),
#' f2 = list(freq = c(1900, 2500), amp = c(25, 30), width = 100),
#' f3 = list(freq = 3400, amp = 30, width = 120)
#' ))
#' plot(as.numeric(rownames(e4)), 20 * log10(e3[, ncol(e3)]),
#' type = 'l', xlab = 'kHz', ylab = 'dB')
#' points(as.numeric(rownames(e4)), 20 * log10(e4[, ncol(e4)]),
#' type = 'l', col = 'red', lty = 2)
getSpectralEnvelope = function(
nr,
nc,
formants = NA,
formantDep = 1,
formantWidth = 1,
lipRad = 6,
noseRad = 4,
mouth = NA,
mouthOpenThres = 0.2,
openMouthBoost = 0,
vocalTract = NULL,
temperature = 0.05,
formDrift = .3,
formDisp = .2,
formantDepStoch = 1,
smoothLinearFactor = 1,
formantCeiling = 2,
samplingRate = 16000,
speedSound = 35400,
smoothing = list(),
output = c('simple', 'detailed')[1],
plot = FALSE,
duration = NULL,
colorTheme = c('bw', 'seewave', '...')[1],
col = NULL,
xlab = 'Time',
ylab = 'Frequency, kHz',
...
) {
# standard formatting
formants = reformatFormants(formants)
if (is.list(formants)) {
bandwidth_specified = as.numeric(which(unlist(
lapply(formants, function(x) 'width' %in% names(x))
)))
} else {
bandwidth_specified = numeric(0)
}
if (!is.null(vocalTract)) {
if (!any(is.na(vocalTract))) {
if (is.list(vocalTract) |
(is.numeric(vocalTract) & length(vocalTract) > 1)) {
vocalTract = do.call(getSmoothContour, c(smoothing, list(
vocalTract,
len = nc,
valueFloor = permittedValues['vocalTract', 'low'],
valueCeiling = permittedValues['vocalTract', 'high'],
plot = FALSE
))) # vocalTract is now either NULL/NA or numeric of length nc
}
}
}
## estimate vocal tract length
if (!is.list(vocalTract) & !is.numeric(vocalTract) & is.list(formants)) {
# if we don't know vocalTract, but at least one formant is defined,
# we guess the length of vocal tract
vocalTract = estimateVTL(formants = formants,
speedSound = speedSound,
checkFormat = TRUE) # may need to remove non-integer formants
}
# if is.na(formants) or if there's something wrong with it,
# we fall back on vocalTract to make a schwa
if (length(formants[[1]]) < 2 & is.numeric(vocalTract) &
temperature > 0 & formantDep > 0 & formantDepStoch > 0) {
freq = speedSound / 4 / vocalTract
formants = list('f1' = data.frame(
'time' = seq(0, 1, length.out = length(freq)),
'freq' = freq,
'amp' = NA,
'width' = getBandwidth(freq) # corrected Tappert, Martony, and Fant (TMF)-1963
))
}
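# illustrative arithmetic (not part of the algorithm): for a quarter-wavelength
# resonator with vocalTract = 17.5 cm, F1 = 35400 / (4 * 17.5) ~ 506 Hz, and
# subsequent formants are spaced by speedSound / (2 * vocalTract) ~ 1011 Hz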
# create a "spectrogram"-shaped filter matrix
spectralEnvelope = matrix(0, nrow = nr, ncol = nc)
### START OF FORMANTS
if (is.list(formants)) {
## Upsample to the length of fft steps
formants_upsampled = vector('list', length = length(formants))
for (f in 1:length(formants)) {
formant_f = data.frame(time = seq(0, 1, length.out = nc))
for (v in c('freq', 'amp', 'width')) {
if (length(formants[[f]][, v]) > 1 & !any(is.na(formants[[f]][, v]))) {
# just spline produces imprecise, overly smoothed curves. Loess is just
# too slow for this. So we apply linear interpolation to formant values
# first, to get a fairly straight line between anchors, and THEN smooth
# it out with spline
if (v == 'freq') {
formant_f[, v] = do.call(getSmoothContour, c(smoothing, list(
formants[[f]][, v],
len = nc,
valueFloor = 1,
plot = FALSE
)))
} else {
# just use approx for the rest to speed things up
formant_f[, v] = approx(
formants[[f]][, v],
n = nc,
x = formants[[f]]$time)$y
}
} else {
formant_f[, v] = rep(formants[[f]][1, v], nc)
}
}
formant_f$freq = formant_f$freq * vocalTract[1] / vocalTract
if (!f %in% bandwidth_specified) {
formant_f$width = getBandwidth(formant_f$freq)
}
formants_upsampled[[f]] = formant_f
}
names(formants_upsampled) = names(formants)
nFormants = length(formants)
amplScaleFactor = rep(1, nFormants)
## Stochastic part (only for temperature > 0)
if (temperature > 0) {
if (formDisp == 0) formDisp = 1e-6 # otherwise division by 0
# non-integer formants like "f1.4" refer to extra zero-pole pairs.
# They should not be considered for VTL estimation or for adding formants
non_integer_formants = apply(
as.matrix(names(formants_upsampled)), 1, function(x) {
grepl('.', x, fixed = TRUE)
})
# create a few new, relatively high-frequency extra formants
if(!is.numeric(formantDepStoch)) formantDepStoch = 1
if (!is.numeric(vocalTract) & length(formants) > 1 &
formantDepStoch > 0 & formantDep > 0) {
ff = unlist(lapply(formants[!non_integer_formants], function(x) x$freq[1]))
formantDispersion = getFormantDispersion(ff,
speedSound = speedSound,
method = 'regression')
} else if (is.numeric(vocalTract)) {
formantDispersion = speedSound / (2 * vocalTract)
} else {
formantDispersion = NA # making sdG also NA, ie extra formants not added
}
sdG = formantDispersion * temperature * formDisp
nFormants_integer = length(formants_upsampled) - sum(non_integer_formants)
freq_max = max(formants_upsampled[[nFormants]][, 'freq'])
if (!any(is.na(sdG))) {
# formant_f = (2 * f - 1) / 2 * formantDispersion,
# therefore, to generate formants to 2 * Nyquist
# (to compensate for downward drag of lower formants)
# 2 * nyquist = (2 * nExtraFormants - 1) / 2 * formantDispersion
# Solving for nExtraFormants gives (nyquist * 4 / formantDispersion + 1) / 2:
nExtraFormants = round(
(samplingRate * formantCeiling / min(formantDispersion) + 1) / 2
) - nFormants
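# illustrative numbers: with samplingRate = 16000, formantCeiling = 2,
# formantDispersion = 1000 Hz and nFormants = 3, this gives roughly
# (32000 / 1000 + 1) / 2 - 3 ~ 13 extra formants up to twice the Nyquist frequency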
if (is.numeric(nExtraFormants) && nExtraFormants > 0) {
# if we are going to add extra formants
nf = length(formantDispersion)
extraFreqs = extraWidths = matrix(NA, nrow = nf, ncol = nExtraFormants)
extraAmps = rgamma(
nExtraFormants,
# mean = formantDepStoch, sd = formantDepStoch * temperature
1 / temperature ^ 2,
1 / (formantDepStoch * temperature ^ 2)
)
amplScaleFactor = c(amplScaleFactor, extraAmps)
for (frame in 1:nf) {
# once for static vtl, for each frame in 1:nc otherwise
idx = (nFormants_integer + 1) : (nFormants_integer + nExtraFormants)
extraFreqs_regular = (2 * idx - 1) / 2 * formantDispersion[frame]
extraFreqs[frame, ] = rgamma(
nExtraFormants,
# mean = extraFreqs_regular, sd = sdG
extraFreqs_regular ^ 2 / sdG[frame] ^ 2,
extraFreqs_regular / sdG[frame] ^ 2
)
extraWidths[frame, ] = getBandwidth(extraFreqs[frame, ])
}
formants_upsampled = c(formants_upsampled, vector('list', nExtraFormants))
for (f in 1:nExtraFormants) {
formants_upsampled[[nFormants + f]] = data.frame (
'time' = formants_upsampled[[1]][, 'time'],
'freq' = extraFreqs[, f],
'amp' = NA,
'width' = extraWidths[, f]
)
}
}
}
# wiggle both user-specified and stochastic formants
nFormants = length(formants_upsampled)
for (f in 1:nFormants) {
for (c in 2:4) {
# wiggle freq, ampl and bandwidth independently
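# illustrative scale (assuming rw_range sets the span of the walk): with the
# defaults temperature = 0.05 and formDrift = 0.3, rw_range = 0.015, so each
# parameter drifts within a band of roughly 1.5% around its specified value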
rw = getRandomWalk(
len = nc,
rw_range = temperature * formDrift,
rw_smoothing = 0.3,
trend = rnorm(1)
)
# if nc == 1, returns one number close to 1
if (length(rw) > 1) {
# for actual random walks, make sure mean is 1
rw = rw - mean(rw) + 1
}
formants_upsampled[[f]][, c] = formants_upsampled[[f]][, c] * rw
}
} # end of wiggling formants
} # end of if temperature > 0
## Deterministic part
# convert formant freqs and widths from Hz to bins
bin_width = samplingRate / 2 / nr # Hz
bin_freqs = seq(bin_width / 2, samplingRate / 2, length.out = nr) # Hz
for (f in 1:length(formants_upsampled)) {
formants_upsampled[[f]][, 'freq'] =
(formants_upsampled[[f]][, 'freq'] - bin_width / 2) / bin_width + 1
# frequencies expressed in bin indices (how many bin widths above the
# central frequency of the first bin)
formants_upsampled[[f]][, 'width'] =
formants_upsampled[[f]][, 'width'] / bin_width * formantWidth
}
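# illustrative conversion: with samplingRate = 16000 and nr = 512, bin_width =
# 15.625 Hz, so a formant at 500 Hz maps to bin (500 - 7.8) / 15.625 + 1 ~ 32.5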
# mouth opening
if (length(mouth) < 1 | any(is.na(mouth))) {
mouthOpening_upsampled = rep(0.5, nc) # defaults to mouth half-open the
# whole time - sort of hanging loosely agape ;))
mouthOpen_binary = rep(1, nc)
} else {
mouthOpening_upsampled = do.call(getSmoothContour, c(smoothing, list(
len = nc,
anchors = mouth,
valueFloor = permittedValues['mouthOpening', 'low'],
valueCeiling = permittedValues['mouthOpening', 'high'],
plot = FALSE
)))
# mouthOpening_upsampled[mouthOpening_upsampled < mouthOpenThres] = 0
mouthOpen_binary = ifelse(mouthOpening_upsampled > mouthOpenThres, 1, 0)
}
# plot(mouthOpening_upsampled, type = 'l')
# adjust formants for mouth opening
adjustment_bins = 0
if (!is.null(vocalTract)) {
if (!any(is.na(vocalTract))) {
# is.finite() returns F for NaN, NA, inf, etc
adjustment_hz = (mouthOpening_upsampled - 0.5) * speedSound /
(4 * vocalTract) # speedSound = 35400 cm/s, speed of sound in warm
# air. The formula for mouth opening is adapted from Moore (2016) "A
# Real-Time Parametric General-Purpose Mammalian Vocal Synthesiser".
# mouthOpening = .5 gives no modification (neutral, "default" position).
# Basically we could assume a closed-closed tube for closed mouth and a
# closed-open tube for open mouth, but since formants can be specified
# rather than calculated based on vocalTract, we just subtract half the
# total difference between open and closed tubes in Hz from each formant
# value as the mouth goes from half-open (neutral) to fully closed, or
# we add half that value as the mouth goes from neutral to max open. NB:
# so "closed" is actually "half-closed", and we assume that nostrils are
# always open (so not really a closed-closed tube)
adjustment_bins = adjustment_hz / bin_width
}
}
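# illustrative arithmetic: with vocalTract = 17.5 cm, a fully open mouth (1)
# shifts each formant up by (1 - 0.5) * 35400 / (4 * 17.5) ~ +253 Hz, and a
# fully closed mouth (0) shifts it down by the same amount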
for (f in 1:nFormants) {
formants_upsampled[[f]][, 'freq'] =
formants_upsampled[[f]][, 'freq'] + adjustment_bins
# force each formant frequency to be positive (min 1 bin)
formants_upsampled[[f]][, 'freq'] [formants_upsampled[[f]][, 'freq'] < 1] = 1
}
# nasalize the parts with closed mouth: see Hawkins & Stevens (1985);
# http://www.cslu.ogi.edu/tutordemos/SpectrogramReading/cse551html/cse551/node35.html
nasalizedIdx = which(mouthOpen_binary == 0) # or specify a separate
# nasalization contour
# increase F1 bandwidth to 175 Hz
formants_upsampled$f1[nasalizedIdx, 'width'] = 175 / bin_width
if (length(nasalizedIdx) > 0) {
# add a pole
formants_upsampled$fnp = formants_upsampled$f1
formants_upsampled$fnp[, 'amp'] = 0
formants_upsampled$fnp[nasalizedIdx, 'amp'] = NA
formants_upsampled$fnp[nasalizedIdx, 'width'] =
formants_upsampled$f1[nasalizedIdx, 'width'] * 2 / 3
formants_upsampled$fnp[nasalizedIdx, 'freq'] =
ifelse(
formants_upsampled$f1[nasalizedIdx, 'freq'] > 550 / bin_width,
formants_upsampled$f1[nasalizedIdx, 'freq'] - 250 / bin_width,
formants_upsampled$f1[nasalizedIdx, 'freq'] + 250 / bin_width
)
# 250 Hz below or above F1, depending on whether F1 is above or below
# 550 Hz
# add a zero
formants_upsampled$fnz = formants_upsampled$f1
formants_upsampled$fnz[, 'amp'] = 0
formants_upsampled$fnz[nasalizedIdx, 'amp'] = NA
formants_upsampled$fnz[nasalizedIdx, 'freq'] =
(formants_upsampled$fnp[nasalizedIdx, 'freq'] +
formants_upsampled$f1[nasalizedIdx, 'freq']) / 2 # midway between
# f1 and fnp
formants_upsampled$fnz[nasalizedIdx, 'width'] =
formants_upsampled$fnp[nasalizedIdx, 'width']
# modify f1
formants_upsampled$f1[nasalizedIdx, 'amp'] =
formants_upsampled$f1[nasalizedIdx, 'amp'] * 4 / 5
formants_upsampled$f1[nasalizedIdx, 'width'] =
formants_upsampled$f1[nasalizedIdx, 'width'] * 5 / 4
nFormants = length(formants_upsampled)
amplScaleFactor = c(amplScaleFactor, .5, .5)
# make the added zero-pole half as strong as ordinary formants
}
# Add formants to spectrogram (Stevens 2000, Ch. 3, ~p. 137)
freqs_bins = 1:nr
poles = 1:nFormants
zeros = as.numeric(which(sapply(
formants_upsampled, function(x) any(x[, 'amp'] < 0)
)))
if (length(zeros) > 0) {
poles = poles[-zeros]
for (z in zeros)
formants_upsampled[[z]]$amp = -formants_upsampled[[z]]$amp
# need to have positive amp values (we know which ones are zeros)
}
s = complex(real = 0, imaginary = 2 * pi * freqs_bins)
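# note: each formant is a conjugate pole (or zero) pair sf = -pi * width +
# i * 2 * pi * freq (all in bin units); the transfer function
# T(s) = sf * Conj(sf) / ((s - sf) * (s - Conj(sf))) peaks (or dips, for zeros)
# near freq, and log10(abs(T)) is then scaled to dB below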
for (f in 1:nFormants) {
pf = 2 * pi * formants_upsampled[[f]][, 'freq']
bp = -formants_upsampled[[f]][, 'width'] * pi
sf = complex(real = bp, imaginary = pf)
sfc = Conj(sf)
formant = matrix(0, nrow = nr, ncol = nc)
for (c in 1:nc) {
pole = (f %in% poles)
numerator = sf[c] * sfc[c]
denominator = (s - sf[c]) * (s - sfc[c])
if (pole) {
tns = numerator / denominator # pole
} else {
tns = denominator / numerator # zero
}
formant[, c] = log10(abs(tns))
if (is.na(formants_upsampled[[f]][c, 'amp'])) {
# just convert to dB
formant[, c] = formant[, c] * 20 * amplScaleFactor[f]
} else {
# normalize ampl to be exactly as specified in dB
if (pole) m = max(formant[, c]) else m = -min(formant[, c])
formant[, c] = formant[, c] / m *
formants_upsampled[[f]][c, 'amp'] * amplScaleFactor[f]
# amplScaleFactor is 1 for user-specified and formantDepStoch otherwise
}
}
# plot(formant[, c], type = 'l')
spectralEnvelope = spectralEnvelope + formant
}
spectralEnvelope = spectralEnvelope * formantDep
} else {
mouthOpen_binary = rep(1, nc)
mouthOpening_upsampled = rep(0.5, nc)
bin_width = samplingRate / 2 / nr # otherwise it's not defined if formants = NULL
}
# save frequency and time stamps
freqs = (0:(nr - 1)) * bin_width / 1000
rownames(spectralEnvelope) = freqs
if (is.numeric(duration)) {
colnames(spectralEnvelope) = seq(0, duration, length.out = nc)
} else {
colnames(spectralEnvelope) = seq(0, 1, length.out = nc)
}
# plot(freqs, spectralEnvelope[, 1], type = 'l')
# image(t(spectralEnvelope))
# add correction for not adding higher formants
if (FALSE) {
# add correction for not adding higher formants
# rolloffAdjust = 0 - 12 * log2(((nr*1):1)) [1:nr]
# rolloffAdjust = rolloffAdjust - min(rolloffAdjust)
# spectralEnvelope = apply(spectralEnvelope,
# 2,
# function(x) x + rolloffAdjust)
# or:
# s = as.matrix(data.frame(freq = freqs,
# amp = spectralEnvelope[, 1]))
# peaks = as.data.frame(seewave::fpeaks(s, f = samplingRate, plot = FALSE))
# mod = nls(amp ~ a + b * freq ^ c, peaks, start = list(a = 0, b = -1, c = 1))
# if (FALSE) {
# peaks$pred = predict(mod)
# plot(peaks$freq, peaks$amp, type = 'b')
# points(peaks$freq, peaks$pred, type = 'l', col = 'red')
# }
#
# ms = summary(mod)
# msc = ms$coefficients[, 1]
# specAdjust = as.numeric(-(msc[1] + msc[2] * s[, 'freq'] ^ msc[3]))
# specAdjust = specAdjust - min(specAdjust)
# for (c in 1:ncol(spectralEnvelope)) {
# spectralEnvelope[, c] = spectralEnvelope[, c] + specAdjust
# }
}
# END OF FORMANTS
# add lip radiation when the mouth is open and nose radiation when the mouth
# is closed
lip_dB = lipRad * log2(1:nr) # vector of length nr
nose_dB = noseRad * log2(1:nr)
# plot(lip_dB, type = 'l'); plot(nose_dB, type = 'l')
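# illustrative scale: lipRad = 6 adds 6 dB per doubling of frequency (octave),
# e.g. 0 dB at bin 1, 6 dB at bin 2, 12 dB at bin 4; noseRad is applied the
# same way to frames where the mouth is closed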
for (c in 1:nc) {
spectralEnvelope[, c] = spectralEnvelope[, c] +
lip_dB * mouthOpen_binary[c] +
nose_dB * (1 - mouthOpen_binary[c]) +
mouthOpen_binary[c] * openMouthBoost
}
# plot(spectralEnvelope[, 1], type = 'l')
# convert from dB to linear multiplier of power spectrum
spectralEnvelope_lin = 10 ^ (spectralEnvelope / 20)
# plot(spectralEnvelope_lin[, 1], type = 'l')
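# illustrative conversion: a bin at +6 dB becomes 10 ^ (6 / 20) ~ 2, i.e. the
# amplitude in that frequency bin is roughly doubled by the filter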
if (plot) {
if (is.null(col)) {
colfunc = switchColorTheme(colorTheme)
col = colfunc(100)
}
image(x = as.numeric(colnames(spectralEnvelope)),
y = as.numeric(rownames(spectralEnvelope)),
z = t(spectralEnvelope),
xlab = xlab,
ylab = ylab,
col = col,
...)
}
if (output == 'detailed') {
if (exists('formants_upsampled')) {
formantSummary = as.data.frame(matrix(NA, nrow = nFormants, ncol = nc))
for (i in 1:nFormants) {
formantSummary[i, ] = (formants_upsampled[[i]]$freq - 1) *
bin_width + bin_width / 2 # from bins back to Hz
}
max_freqs = apply(formantSummary, 1, max) # save only to Nyquist
formantSummary = formantSummary[which(max_freqs < (samplingRate / 2)), ]
colnames(formantSummary) = colnames(spectralEnvelope)
} else {
formantSummary = NULL
}
invisible(list(formantSummary = formantSummary,
specEnv = spectralEnvelope_lin,
specEnv_dB = spectralEnvelope))
} else {
invisible(spectralEnvelope_lin)
}
}
#' Add formants
#'
#' A spectral filter that either adds or removes formants from a sound - that
#' is, amplifies or dampens certain frequency bands, as in human vowels. See
#' \code{\link{soundgen}} and \code{\link{getSpectralEnvelope}} for more
#' information. With \code{action = 'remove'} this function can perform inverse
#' filtering to remove formants and obtain raw glottal output, provided that you
#' can specify the correct formant structure. Instead of formants, any arbitrary
#' spectral filtering function can be applied using the \code{spectralEnvelope}
#' argument (eg for a low/high/bandpass filter).
#'
#' Algorithm: converts input from a time series (time domain) to a spectrogram
#' (frequency domain) through short-time Fourier transform (STFT), multiplies it by
#' the spectral filter containing the specified formants, and transforms back to
#' a time series via inverse STFT. This is a subroutine for voice synthesis in
#' \code{\link{soundgen}}, but it can also be applied to a recording.
#'
#' @seealso \code{\link{getSpectralEnvelope}} \code{\link{transplantFormants}}
#' \code{\link{soundgen}}
#'
#' @inheritParams soundgen
#' @inheritParams spectrogram
#' @inheritParams addAM
#' @param action 'add' = add formants to the sound, 'remove' = remove formants
#' (inverse filtering)
#' @param dB if NULL (default), the spectral envelope is applied on the original
#' scale; otherwise, it is set to range from 1 to 10 ^ (dB / 20)
#' @param specificity a way to sharpen or blur the spectral envelope (spectrum ^
#' specificity): 1 = no change, >1 = sharper, <1 = blurred
#' @param spectralEnvelope (optional): as an alternative to specifying formant
#' frequencies, we can provide the exact filter - a vector of non-negative
#' numbers specifying the power in each frequency bin on a linear scale
#' (interpolated to length equal to windowLength_points/2). A matrix
#' specifying the filter for each STFT step is also accepted. The easiest way
#' to create this matrix is to call soundgen:::getSpectralEnvelope or to use
#' the spectrum of a recorded sound
#' @param zFun (optional) an arbitrary function to apply to the spectrogram
#' prior to iSTFT, where "z" is the spectrogram - a matrix of complex values
#' (see examples)
#' @param formDrift,formDisp scaling factors for the effect of temperature on
#' formant drift and dispersal, respectively
#' @param windowLength_points length of FFT window, points
#' @param normalize "max" = maximum possible peak amplitude (default), "orig" =
#' same as input, "none" = no normalization
#' @export
#' @examples
#' sound = c(rep(0, 1000), runif(8000) * 2 - 1, rep(0, 1000)) # white noise
#' # NB: pad with silence to avoid artefacts if removing formants
#' # playme(sound)
#' # spectrogram(sound, samplingRate = 16000)
#'
#' # add F1 = 900, F2 = 1300 Hz
#' sound_filtered = addFormants(sound, samplingRate = 16000,
#' formants = c(900, 1300))
#' # playme(sound_filtered)
#' # spectrogram(sound_filtered, samplingRate = 16000)
#'
#' # ...and remove them again (assuming we know what the formants are)
#' sound_inverse_filt = addFormants(sound_filtered,
#' samplingRate = 16000,
#' formants = c(900, 1300),
#' action = 'remove')
#' # playme(sound_inverse_filt)
#' # spectrogram(sound_inverse_filt, samplingRate = 16000)
#'
#' \dontrun{
#' ## Perform some user-defined manipulation of the spectrogram with zFun
#' # Ex.: noise removal - silence all bins under a threshold,
#' # say 40-50 dB below the max value
#' s_noisy = soundgen(sylLen = 200, addSilence = 0,
#' noise = list(time = c(-100, 300), value = -20))
#' spectrogram(s_noisy, 16000)
#' # playme(s_noisy)
#' zFun = function(z, cutoff = -50) {
#' az = abs(z)
#' thres = max(az) * 10 ^ (cutoff / 20)
#' z[which(az < thres)] = 0
#' return(z)
#' }
#' s_denoised = addFormants(s_noisy, samplingRate = 16000,
#' formants = NA, zFun = zFun, cutoff = -40)
#' spectrogram(s_denoised, 16000)
#' # playme(s_denoised)
#'
#' # If neither formants nor spectralEnvelope is defined, only lipRad has an effect
#' # For ex., we can boost low frequencies by 6 dB/oct
#' noise = runif(8000)
#' noise1 = addFormants(noise, 16000, lipRad = -6)
#' seewave::meanspec(noise1, f = 16000, dB = 'max0')
#'
#' # Arbitrary spectra can be defined with spectralEnvelope. For ex., we can
#' # have a flat spectrum up to 2 kHz (Nyquist / 4) and -3 dB/kHz above:
#' freqs = seq(0, 16000 / 2, length.out = 100)
#' n = length(freqs)
#' idx = (n / 4):n
#' sp_dB = c(rep(0, n / 4 - 1), (freqs[idx] - freqs[idx[1]]) / 1000 * (-3))
#' plot(freqs, sp_dB, type = 'b')
#' noise2 = addFormants(noise, 16000, lipRad = 0, spectralEnvelope = 10 ^ (sp_dB / 20))
#' seewave::meanspec(noise2, f = 16000, dB = 'max0')
#'
#' ## Use the spectral envelope of an existing recording (bleating of a sheep)
#' # (see also the same example with noise as source in ?generateNoise)
#' # (NB: this can also be achieved with a single call to transplantFormants)
#' data(sheep, package = 'seewave') # import a recording from seewave
#' sound_orig = as.numeric(scale(sheep@left))
#' samplingRate = sheep@samp.rate
#' sound_orig = sound_orig / max(abs(sound_orig)) # range -1 to +1
#' # playme(sound_orig, samplingRate)
#'
#' # get a few pitch anchors to reproduce the original intonation
#' pitch = analyze(sound_orig, samplingRate = samplingRate,
#' pitchMethod = c('autocor', 'dom'))$detailed$pitch
#' pitch = pitch[!is.na(pitch)]
#'
#' # extract a frequency-smoothed version of the original spectrogram
#' # to use as filter
#' specEnv_bleating = spectrogram(sound_orig, windowLength = 5,
#' samplingRate = samplingRate, output = 'original', plot = FALSE)
#' # image(t(log(specEnv_bleating)))
#'
#' # Synthesize source only, with flat spectrum
#' sound_unfilt = soundgen(sylLen = 2500, pitch = pitch,
#' rolloff = 0, rolloffOct = 0, rolloffKHz = 0,
#' temperature = 0, jitterDep = 0, subDep = 0,
#' formants = NULL, lipRad = 0, samplingRate = samplingRate,
#' invalidArgAction = 'ignore') # prevent soundgen from increasing samplingRate
#' # playme(sound_unfilt, samplingRate)
#' # seewave::meanspec(sound_unfilt, f = samplingRate, dB = 'max0') # ~flat
#'
#' # Force spectral envelope to the shape of target
#' sound_filt = addFormants(sound_unfilt, formants = NULL,
#' spectralEnvelope = specEnv_bleating, samplingRate = samplingRate)
#' # playme(sound_filt, samplingRate) # playme(sound_orig, samplingRate)
#' # spectrogram(sound_filt, samplingRate) # spectrogram(sound_orig, samplingRate)
#'
#' # The spectral envelope is now similar to the original recording. Compare:
#' par(mfrow = c(1, 2))
#' seewave::meanspec(sound_orig, f = samplingRate, dB = 'max0', alim = c(-50, 20))
#' seewave::meanspec(sound_filt, f = samplingRate, dB = 'max0', alim = c(-50, 20))
#' par(mfrow = c(1, 1))
#' # NB: but the source of excitation in the original is actually a mix of
#' # harmonics and noise, while the new sound is purely tonal
#' }
addFormants = function(
x,
samplingRate = NULL,
formants = NULL,
spectralEnvelope = NULL,
action = c('add', 'remove')[1],
dB = NULL,
specificity = 1,
zFun = NULL,
vocalTract = NA,
formantDep = 1,
formantDepStoch = 1,
formantWidth = 1,
formantCeiling = 2,
lipRad = 6,
noseRad = 4,
mouthOpenThres = 0,
mouth = NA,
temperature = 0.025,
formDrift = 0.3,
formDisp = 0.2,
smoothing = list(),
windowLength_points = 800,
overlap = 75,
normalize = c('max', 'orig', 'none')[1],
play = FALSE,
saveAudio = NULL,
reportEvery = NULL,
cores = 1,
...
) {
formants = reformatFormants(formants)
mouth = reformatFormants(mouth)
# match args
myPars = c(as.list(environment()), list(...))
# exclude some args
myPars = myPars[!names(myPars) %in% c(
'x', 'samplingRate', 'reportEvery', 'cores', 'saveAudio',
'formants', 'mouth')]
myPars$formants = formants
myPars$mouth = mouth
pa = processAudio(x,
samplingRate = samplingRate,
saveAudio = saveAudio,
funToCall = '.addFormants',
myPars = myPars,
reportEvery = reportEvery,
cores = cores)
# prepare output
if (pa$input$n == 1) {
result = pa$result[[1]]
} else {
result = pa$result
}
invisible(result)
}
#' Add formants per sound
#'
#' Internal soundgen function called by \code{\link{addFormants}}
#' @inheritParams addFormants
#' @param audio a list returned by \code{readAudio}
#' @keywords internal
.addFormants = function(
audio,
formants,
spectralEnvelope = NULL,
action = c('add', 'remove')[1],
dB = NULL,
specificity = 1,
zFun = NULL,
vocalTract = NA,
formantDep = 1,
formantDepStoch = 1,
formantWidth = 1,
formantCeiling = 2,
lipRad = 6,
noseRad = 4,
mouthOpenThres = 0,
mouth = NA,
temperature = 0.025,
formDrift = 0.3,
formDisp = 0.2,
smoothing = list(),
windowLength_points = 800,
overlap = 75,
dynamicRange = 120,
normalize = c('max', 'orig', 'none')[1],
play = FALSE,
...
) {
dynamicRange_lin = 10 ^ (-dynamicRange / 20)
# prepare vocal tract filter (formants + some spectral noise + lip radiation)
if (!any(audio$sound != 0) |
(is.na(formants)[1] &
is.na(vocalTract)[1] &
(is.na(lipRad)[1] | lipRad == 0))) {
# otherwise fft glitches
soundFiltered = audio$sound
} else {
# pad input with one windowLength_points of 0 to avoid softening the attack
sound = c(rep(0, windowLength_points),
audio$sound,
rep(0, windowLength_points))
# for very short sounds, make sure the analysis window is no more
# than half the sound's length
windowLength_points = min(windowLength_points, floor(length(sound) / 2))
step = seq(1,
max(1, (length(sound) - windowLength_points)),
windowLength_points - (overlap * windowLength_points / 100))
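# illustrative hop size: windowLength_points = 800 with overlap = 75 gives
# steps of 800 - 600 = 200 samples (12.5 ms at a 16 kHz sampling rate)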
z = seewave::stdft(
wave = as.matrix(sound),
f = audio$samplingRate,
wl = windowLength_points,
zp = 0,
step = step,
wn = 'hamming',
fftw = FALSE,
scale = TRUE,
complex = TRUE
)
nc = ncol(z) # length(step) # number of windows for fft
nr = nrow(z) # windowLength_points / 2 # number of frequency bins for fft
# are formants moving or stationary?
# (basically always moving, unless temperature = 0 and nothing else changes)
if (is.null(spectralEnvelope)) {
if (temperature > 0) {
movingFormants = TRUE
} else {
if (is.list(formants)) {
movingFormants = max(sapply(formants, function(x) sapply(x, length))) > 1
} else {
movingFormants = FALSE
}
if (is.list(mouth)) {
if (sum(mouth$value != .5) > 0) {
movingFormants = TRUE
}
}
if (is.list(vocalTract)) {
if (length(vocalTract$value) > 1) {
movingFormants = TRUE
}
}
}
nInt = ifelse(movingFormants, nc, 1)
# prepare the filter
spectralEnvelope = getSpectralEnvelope(
nr = nr,
nc = nInt,
formants = formants,
formantDep = formantDep,
formantDepStoch = formantDepStoch,
formantWidth = formantWidth,
formantCeiling = formantCeiling,
lipRad = lipRad,
noseRad = noseRad,
mouthOpenThres = mouthOpenThres,
mouth = mouth,
temperature = temperature,
formDrift = formDrift,
formDisp = formDisp,
smoothing = smoothing,
samplingRate = audio$samplingRate,
vocalTract = vocalTract
)
} else { # user-provided spectralEnvelope
spectralEnvelope = as.matrix(spectralEnvelope) # if a vector,
# becomes a matrix with one row
if (nrow(spectralEnvelope) > 1) {
nInt = nc
movingFormants = TRUE
} else { # vector
nInt = 1
movingFormants = FALSE
}
spectralEnvelope = interpolMatrix(
spectralEnvelope,
nr = nr,
nc = nInt,
interpol = 'approx'
)
}
# image(t(spectralEnvelope))
# filtering
if (!is.null(dB) & is.numeric(dB)) {
spectralEnvelope = spectralEnvelope ^ specificity
# range(spectralEnvelope)
spectralEnvelope = spectralEnvelope / max(spectralEnvelope) * max(abs(z)) * 10 ^ (dB / 20)
spectralEnvelope[spectralEnvelope < 1] = 1
# ran_orig = 20 * log10(range(spectralEnvelope) + 1e-4)
# spectralEnvelope = (spectralEnvelope / max(spectralEnvelope) *
# 10 ^ (dB / 20)) ^ (dB / diff(ran_orig))
# ran_orig = diff(20 * log10(range(spectralEnvelope)))
# spectralEnvelope = spectralEnvelope ^ (dB / ran_orig)
} else {
spectralEnvelope = spectralEnvelope ^ specificity
}
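# illustrative scale: with dB = 30 the filter now ranges from 1 up to
# max(abs(z)) * 10 ^ (30 / 20), i.e. about 31.6 times the spectrogram maximum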
if (action == 'add') {
if (movingFormants) {
z = z * spectralEnvelope
} else {
z = apply(z, 2, function(x)
x * spectralEnvelope)
}
} else if (action == 'remove') {
if (movingFormants) {
z = z / spectralEnvelope
} else {
z = apply(z, 2, function(x)
x / spectralEnvelope)
}
}
# apply some arbitrary function to the spectrogram before iSTFT
if (!is.null(zFun) && is.function(zFun))
z = do.call(zFun, list(z = z, ...))
# inverse fft
z[abs(z) <= dynamicRange_lin] = 0 # anything under dynamicRange becomes 0
soundFiltered = as.numeric(
seewave::istft(
z,
f = audio$samplingRate,
ovlp = overlap,
wl = windowLength_points,
output = "matrix"
)
)
# spectrogram(soundFiltered, audio$samplingRate)
# normalize
if (normalize == 'max' | normalize == TRUE) {
ms = try(max(abs(soundFiltered)))
if (!inherits(ms, 'try-error')) {
# soundFiltered = soundFiltered - mean(soundFiltered)
soundFiltered = soundFiltered / max(abs(soundFiltered)) * audio$scale
}
} else if (normalize == 'orig') {
ms = try(max(abs(soundFiltered)))
if (!inherits(ms, 'try-error')) {
soundFiltered = soundFiltered / max(abs(soundFiltered)) * audio$scale_used
}
}
# remove zero padding
l = length(soundFiltered)
hl = seewave::env(soundFiltered[(l - windowLength_points + 1):l],
f = audio$samplingRate, envt = 'hil', plot = FALSE)
tailIdx = suppressWarnings(min(which(hl < (.01 * max(hl)))))
idx = l - windowLength_points + tailIdx
if(!is.finite(idx)) idx = l # l - windowLength_points
soundFiltered = soundFiltered[(windowLength_points + 1):idx]
# osc(soundFiltered, audio$samplingRate)
}
if (play) playme(soundFiltered, audio$samplingRate)
if (is.character(audio$saveAudio)) {
filename = paste0(audio$saveAudio, '/', audio$filename_noExt, '.wav')
writeAudio(soundFiltered, audio = audio, filename = filename)
}
# spectrogram(soundFiltered, audio$samplingRate, ylim = c(0, 4))
# playme(soundFiltered, audio$samplingRate)
return(soundFiltered)
}
#' Transplant formants
#'
#' Takes the general spectral envelope of one sound (\code{donor}) and
#' "transplants" it onto another sound (\code{recipient}). For biological sounds
#' like speech or animal vocalizations, this has the effect of replacing the
#' formants in the recipient sound while preserving the original intonation and
#' (to some extent) voice quality. Note that the amount of spectral smoothing
#' (specified with \code{freqWindow} or \code{blur}) is a crucial parameter: too
#' little smoothing, and noise between harmonics will be amplified, creating
#' artifacts; too much, and formants may be missed. The default is to set
#' \code{freqWindow} to the estimated median pitch, but this is time-consuming
#' and error-prone, so set it to a reasonable value manually if possible. Also
#' ensure that both sounds have the same sampling rate.
#'
#' Algorithm: makes spectrograms of both sounds, interpolates and smooths or
#' blurs the donor spectrogram, flattens the recipient spectrogram, multiplies
#' the spectrograms, and transforms back into time domain with inverse STFT.
#'
#' @seealso \code{\link{transplantEnv}} \code{\link{getSpectralEnvelope}}
#' \code{\link{addFormants}} \code{\link{spectrogram}} \code{\link{soundgen}}
#'
#' @inheritParams spectrogram
#' @param donor the sound that provides the formants (vector, Wave, or file) or
#' the desired spectral filter (matrix) as returned by
#' \code{\link{getSpectralEnvelope}}
#' @param recipient the sound that receives the formants (vector, Wave, or file)
#' @param freqWindow the width of smoothing window used to flatten the
#' recipient's spectrum per frame. Defaults to median pitch of the donor (or
#' of the recipient if donor is a filter matrix). If \code{blur} is NULL,
#' \code{freqWindow} also controls the amount of smoothing applied to the
#' donor's spectrogram
#' @param blur the amount of Gaussian blur applied to the donor's spectrogram as
#' a faster and more flexible alternative to smoothing it per bin with
#' \code{freqWindow}. Provide two numbers: frequency (Hz, normally
#' approximately equal to freqWindow), time (ms) (NA / NULL / 0 means no
#' blurring in that dimension). See examples and \code{\link{spectrogram}}
#' @export
#' @examples
#' \dontrun{
#' # Objective: take formants from the bleating of a sheep and apply them to a
#' # synthetic sound with any arbitrary duration, intonation, nonlinearities etc
#' data(sheep, package = 'seewave') # import a recording from seewave
#' playme(sheep)
#' spectrogram(sheep, osc = TRUE)
#'
#' recipient = soundgen(
#' sylLen = 1200,
#' pitch = c(100, 300, 250, 200),
#' vibratoFreq = 9, vibratoDep = 1,
#' addSilence = 180,
#' samplingRate = sheep@samp.rate, # same as donor
#' invalidArgAction = 'ignore') # force to keep the low samplingRate
#' playme(recipient, sheep@samp.rate)
#' spectrogram(recipient, sheep@samp.rate, osc = TRUE)
#'
#' s1 = transplantFormants(
#' donor = sheep,
#' recipient = recipient,
#' samplingRate = sheep@samp.rate)
#' playme(s1, sheep@samp.rate)
#' spectrogram(s1, sheep@samp.rate, osc = TRUE)
#'
#' # The spectral envelope of s1 will be similar to sheep's on a frequency scale
#' # determined by freqWindow. Compare the spectra:
#' par(mfrow = c(1, 2))
#' seewave::meanspec(sheep, dB = 'max0', alim = c(-50, 20), main = 'Donor')
#' seewave::meanspec(s1, f = sheep@samp.rate, dB = 'max0',
#' alim = c(-50, 20), main = 'Processed recipient')
#' par(mfrow = c(1, 1))
#'
#' # if needed, transplant amplitude envelopes as well:
#' s2 = transplantEnv(donor = sheep, samplingRateD = sheep@samp.rate,
#' recipient = s1, windowLength = 10)
#' playme(s2, sheep@samp.rate)
#' spectrogram(s2, sheep@samp.rate, osc = TRUE)
#'
#' # using "blur" to apply Gaussian blur to the donor's spectrogram instead of
#' # smoothing per frame with "freqWindow" (~2.5 times faster)
#' spectrogram(sheep, blur = c(150, 0)) # preview to select the amount of blur
#' s1b = transplantFormants(
#' donor = sheep,
#' recipient = recipient,
#' samplingRate = sheep@samp.rate,
#' freqWindow = 150,
#' blur = c(150, 0))
#' # blur: 150 = SD of 150 Hz along the frequency axis,
#' # 0 = no smoothing along the time axis
#' playme(s1b, sheep@samp.rate)
#' spectrogram(s1b, sheep@samp.rate, osc = TRUE)
#'
#' # Now we use human formants on sheep source: the sheep asks "why?"
#' s3 = transplantFormants(
#' donor = getSpectralEnvelope(
#' nr = 512, nc = 100, # fairly arbitrary dimensions
#' formants = 'uaaai',
#' samplingRate = sheep@samp.rate),
#' recipient = sheep,
#' samplingRate = sheep@samp.rate)
#' playme(s3, sheep@samp.rate)
#' spectrogram(s3, sheep@samp.rate, osc = TRUE)
#' }
transplantFormants = function(donor,
recipient,
samplingRate = NULL,
freqWindow = NULL,
blur = NULL,
dynamicRange = 80,
windowLength = 50,
step = NULL,
overlap = 90,
wn = 'gaussian',
zp = 0) {
if (!is.null(step)) {
overlap = (1 - step / windowLength) * 100 # for istft
} else {
step = windowLength * (1 - overlap / 100)
}
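# illustrative conversion: windowLength = 50 ms with overlap = 90 gives
# step = 50 * (1 - 90 / 100) = 5 ms between successive STFT frames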
# Read inputs
recipient = readAudio(
recipient,
input = checkInputType(recipient),
samplingRate = samplingRate
)
if (!is.matrix(donor)) {
# donor is a sound
donor = readAudio(
donor,
input = checkInputType(donor),
samplingRate = samplingRate
)
# Check that both sounds have the same sampling rate
if (donor$samplingRate != recipient$samplingRate) {
stop('Please use two sounds with the same sampling rate')
}
}
samplingRate = recipient$samplingRate # donor may be a matrix (filter)
windowLength_points = floor(windowLength / 1000 * samplingRate / 2) * 2
spec_recipient = spectrogram(
recipient$sound,
samplingRate = samplingRate,
dynamicRange = dynamicRange,
windowLength = windowLength,
step = step,
overlap = overlap,
wn = wn,
zp = zp,
output = 'complex',
padWithSilence = FALSE,
plot = FALSE
)
if (!is.matrix(donor)) {
# donor is a sound
spec_donor = spectrogram(
donor$sound,
samplingRate = samplingRate,
dynamicRange = dynamicRange,
windowLength = windowLength,
step = step,
overlap = overlap,
wn = wn,
zp = zp,
output = 'original',
padWithSilence = FALSE,
plot = FALSE
)
# Make sure the donor spec has the same dimensions as the recipient spec
spec_donor_rightDim = interpolMatrix(m = spec_donor,
nr = nrow(spec_recipient),
nc = ncol(spec_recipient))
} else {
# donor is a matrix (spectrogram giving the desired formant structure)
spec_donor_rightDim = interpolMatrix(m = donor,
nr = nrow(spec_recipient),
nc = ncol(spec_recipient))
}
rownames(spec_donor_rightDim) = rownames(spec_recipient)
# Flatten the recipient spectrogram
if (!is.numeric(freqWindow)) {
if (!is.null(blur) && is.finite(blur[1])) {
# set freqWindow to the amount of "blur" along the frequency axis
freqWindow = blur[1]
} else {
if (is.matrix(donor)) {
# set freqWindow to the median pitch of recipient
anal_recipient = analyze(recipient$sound, samplingRate, plot = FALSE)
freqWindow = median(anal_recipient$detailed$pitch, na.rm = TRUE)
} else {
# set freqWindow to the median pitch of donor
anal_donor = analyze(donor$sound, samplingRate, plot = FALSE)
freqWindow = median(anal_donor$detailed$pitch, na.rm = TRUE)
}
}
}
if (!is.finite(freqWindow)) {
freqWindow = 400
message('Failed to determine freqWindow based on pitch; defaulting to 400 Hz')
}
freqRange_kHz = diff(range(as.numeric(rownames(spec_recipient))))
freqBin_Hz = freqRange_kHz * 1000 / nrow(spec_recipient)
freqWindow_bins = round(freqWindow / freqBin_Hz, 0)
if (freqWindow_bins < 3) {
message(paste('freqWindow has to be at least 3 bins wide; resetting to',
ceiling(freqBin_Hz * 3)))
freqWindow_bins = 3
}
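# illustrative arithmetic: at 24 kHz with a 50 ms window the spectrogram has
# ~600 frequency bins spanning ~12 kHz, so freqBin_Hz ~ 20 Hz and
# freqWindow = 200 Hz corresponds to round(200 / 20) = 10 bins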
for (i in 1:ncol(spec_recipient)) {
abs_s = abs(spec_recipient[, i])
# plot(log(abs_s), type = 'l')
sc = max(abs_s)
cor_coef = .flatEnv(
list(sound = abs_s,
samplingRate = 1, # sr not really needed
scale = sc,
scale_used = sc),
method = 'peak',
dynamicRange = dynamicRange,
windowLength_points = freqWindow_bins) / abs_s
# plot(Re(spec_recipient[, i]) * cor_coef, type = 'l')
spec_recipient[, i] = spec_recipient[, i] * cor_coef
# plot(abs(spec_recipient[, i]), type = 'l')
}
# Smooth the donor spectrogram
if (is.null(blur)) {
# method 1: extract smooth spectral envelope per frame
# plot(spec_donor_rightDim[, 10], type = 'l')
# image(t(log(spec_donor_rightDim + 1)))
for (i in 1:ncol(spec_donor_rightDim)) {
spec_donor_rightDim[, i] = getEnv(
sound = spec_donor_rightDim[, i],
windowLength_points = freqWindow_bins,
method = 'peak'
)
}
} else {
# method 2: apply a Gaussian blur to the entire spectrogram
bin_width = samplingRate / windowLength_points
filt_dim = c(
round(blur[1] / bin_width) * 2 + 1, # frequency
round(blur[2] / step) * 2 + 1 # time
)
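# illustrative kernel size: blur = c(150, 0) with bin_width ~ 20 Hz and
# step = 5 ms gives c(round(150 / 20) * 2 + 1, round(0 / 5) * 2 + 1) = c(17, 1),
# i.e. Gaussian blur across frequency only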
spec_donor_rightDim = gaussianSmooth2D(
spec_donor_rightDim,
kernelSize = filt_dim,
action = 'blur')
}
# Multiply the spectrograms and reconstruct the audio
spec_recipient_new = spec_recipient * spec_donor_rightDim
# image(log(abs(t(spec_recipient))))
# image(log(t(spec_donor_rightDim)))
# image(log(abs(t(spec_recipient_new))))
recipient_new = as.numeric(
seewave::istft(
spec_recipient_new,
f = samplingRate,
ovlp = overlap,
wl = windowLength_points,
output = "matrix"
)
)
# spectrogram(donor$sound, samplingRate)
# spectrogram(recipient_new, samplingRate)
return(recipient_new)
}