R/formants_utilities.R

Defines functions getFormants lockToFormants convertStringToFormants getBandwidth reformatFormants

Documented in convertStringToFormants getBandwidth getFormants lockToFormants reformatFormants

#' Reformat formants
#'
#' Internal soundgen function.
#'
#' Checks that the formants are formatted in a valid way and expands them to a
#' standard list of dataframes with time, frequency, amplitude, and bandwidth of
#' each formant specified explicitly.
#' @param formants character string like "aoiu", numeric vector like "c(500,
#'   1500, 2400)", or a list with an entry for each formant
#' @param output 'all' (default) includes times stamps, freqs, amplitudes, and
#'   bandwidths; 'freqs' includes only frequencies
#' @param keepNonInteger if FALSE, fractional (anti)formants like 'f1.5' are
#'   removed
#' @keywords internal
#' @examples
#' soundgen:::reformatFormants(NA)
#' soundgen:::reformatFormants('aau')
#' soundgen:::reformatFormants(c(500, 1500, 2500))
#' soundgen:::reformatFormants(list(f1 = 500, f2 = c(1500, 1700)))
#' soundgen:::reformatFormants(list(
#'      f1 = list(freq = 800, amp = 30),
#'      f2 = list(freq = c(1500, 1700, 2200), width = c(100, 150, 175))
#' ))
#'
#' f = list(f1 = c(550, 600), f2 = c(1100, NA, 1600), f2.5 = 2500, f3 = 3000)
#' soundgen:::reformatFormants(f)
#' soundgen:::reformatFormants(f, output = 'freqs')
#' soundgen:::reformatFormants(f, output = 'freqs', keepNonInteger = FALSE)
#'
#' soundgen:::reformatFormants(c(500, 1400), output = 'freqs')
#' soundgen:::reformatFormants(list(f1 = 500, f2 = 1400), output = 'freqs')
#' soundgen:::reformatFormants(list(f1 = c(570, 750),
#'   f2 = NA, f3 = c(2400, 2200, NA)), output = 'freqs')
reformatFormants = function(formants,
                            output = c('all', 'freqs')[1],
                            keepNonInteger = TRUE) {
  if (is.null(formants)) {
    formants = NA
  } else if (is.character(formants)) {
    # "aui" etc - read off values from presets$M1
    formants = convertStringToFormants(formants)
  } else if (is.numeric(formants)) {
    # expand to full format from e.g. "formants = c(500, 1500, 2500)"
    freqs = formants
    formants = vector('list', length(freqs))
    names(formants) = paste0('f', 1:length(freqs))
    if (output == 'all') {
      for (f in 1:length(freqs)) {
        formants[[f]] = data.frame(time = 0,
                                   freq = freqs[f],
                                   amp = NA,
                                   width = getBandwidth(freqs[f]))
      }
    } else {
      # just the frequencies
      for (f in 1:length(freqs)) {
        formants[[f]] = data.frame(freq = freqs[f])
      }
    }
  }
  if (is.list(formants)) {
    if (is.null(names(formants))) {
      names(formants) = paste0('f', 1:length(formants))
    }
    for (f in 1:length(formants)) {
      formant = as.data.frame(formants[[f, drop = FALSE]])
      if (ncol(formant) == 1) colnames(formant) = 'freq'
      if (is.list(formant)) {
        if ('freq' %in% names(formant)) {
          if (output == 'all') {
            # expand to full format from e.g. f1 = list(freq = 550, amp = 30)
            if (is.null(formant$time)) {
              formant$time = seq(0, 1, length.out = nrow(formant))
            }
            if (is.null(formant$amp)) {
              formant$amp = NA
            }
            if (is.null(formant$width)) {
              formant$width = getBandwidth(formant$freq)
            }
          }
        }
      } else {
        # expand to full format from e.g. f1 = 550
        # (numbers are assumed to represent frequency)
        if (output == 'all') {
          formant = data.frame(
            time = seq(0, 1, length.out = length(formant)),
            freq = formant,
            amp = rep(NA, length(formant)),
            width = getBandwidth(formant)
          )
        } else {
          formant = data.frame(freq = as.numeric(formant))
        }
      }
      if (output == 'all') {
        # make sure columns are in the right order (for esthetics & debugging)
        formants[[f]] = formant[, c('time', 'freq', 'amp', 'width')]
      } else {
        formants[[f]] = formant
      }
    }
  } else if (!is.null(formants)) {
    if (any(!is.na(formants))) {
      stop('If defined, formants must be either a list or a string of characters
         from dictionary presets: a, o, i, e, u, 0 (schwa)')
    } else {
      return(NA)
    }
  }

  if (!keepNonInteger) {
    # remove non-integer (anti)formants, if any
    non_integer_formants = apply(as.matrix(names(formants)),
                                 1,
                                 function(x) {
                                   grepl('.', x, fixed = TRUE)
                                 })
    if (any(non_integer_formants)) formants =  formants[!non_integer_formants]
  }

  return(formants)
}



#' Get bandwidth
#'
#' Internal soundgen function.
#'
#' Calculates formant bandwidth as a function of formant frequencies using a
#' modified version of TMF-63 formula. Namely, above 500 Hz it follows the
#' original formula from Tappert, Martony, and Fant (TMF)-1963, and below 500 Hz
#' it applies a correction to allow for energy losses at low frequencies. See
#' Khodai-Joopari & Clermont (2002), "Comparison of formulae for estimating
#' formant bandwidths". Below 250 Hz the bandwidth is forces to drop again to
#' avoid very large values near zero (just guesswork!)
#' @param f a vector of formant frequencies, Hz
#' @keywords internal
#' @examples
#' f = 1:5000
#' plot(f, soundgen:::getBandwidth(f), type = 'l',
#'   xlab = 'Formant frequency, Hz', ylab = 'Estimated bandwidth, Hz')
getBandwidth = function(f) {
  b = rep(NA, length(f))
  f1 = which(f < 250)
  f2 = which(f >= 250 & f < 500)
  f3 = which(f >= 500)
  # just guesswork below 250 Hz - no data for such low freqs,
  # apart from elephant rumbles etc.
  b[f1] = 26.38541 - .042 * f[f1] + .0011 * f[f1] ^ 2
  # see Khodai-Joopari & Clermont 2002
  b[f2] =  52.08333 * (1 + (f[f2]-500) ^ 2/ 10 ^ 5)
  b[f3] = 50 * (1 + f[f3] ^ 2 / 6 / 10 ^ 6)
  # plot(f, b, type = 'l')
  return(b)
}


#' Prepare a list of formants
#'
#' Internal soundgen function.
#'
#' Takes a string of phonemes entered WITHOUT ANY BREAKS. Recognized phonemes in
#' the human preset dictionary: vowels "a" "o" "i" "e" "u" "0" (schwa);
#' consonants "s" "x" "j".
#' @param phonemeString a string of characters from the dictionary of phoneme
#'   presets, e.g., uaaaaii (short u - longer a - medium-long i)
#' @param speaker name of the preset dictionary to use
#' @return Returns a list of formant values, which can be fed directly into
#'   \code{\link{getSpectralEnvelope}}
#' @keywords internal
#' @examples
#' formants = soundgen:::convertStringToFormants(phonemeString = 'a')
#' formants = soundgen:::convertStringToFormants(
#'   phonemeString = 'au', speaker = 'M1')
#' formants = soundgen:::convertStringToFormants(
#'   phonemeString = 'aeui', speaker = 'F1')
#' formants = soundgen:::convertStringToFormants(
#'   phonemeString = 'aaeuiiiii', speaker = 'Chimpanzee')
convertStringToFormants = function(phonemeString, speaker = 'M1') {
  availablePresets = names(presets[[speaker]]$Formants$vowels)
  if (length(availablePresets) < 1) {
    warning(paste0('No phoneme presets found for speaker ', speaker,
                   '. Defaulting to M1'))
    speaker = 'M1'
    availablePresets = names(presets[[speaker]]$Formants$vowels)
  }
  input_phonemes = strsplit(phonemeString, "")[[1]]
  valid_phonemes = input_phonemes[input_phonemes %in% availablePresets]
  unique_phonemes = unique(valid_phonemes)
  if (length(valid_phonemes) < 1)
    return(NA)

  # for each input vowel, look up the corresponding formant values
  # in the presets dictionary and append to formants
  vowels = list()
  formantNames = character()
  for (v in 1:length(unique_phonemes)) {
    vowels[[v]] = presets[[speaker]]$Formants$vowels[unique_phonemes[v]][[1]]
    formantNames = c(formantNames, names(vowels[[v]]))
  }
  formantNames = sort(unique(formantNames))
  names(vowels) = unique_phonemes

  # make sure we have filled in info on all formants from the entire
  # sequence of vowels for each individual vowel
  for (v in 1:length(vowels)) {
    absentFormants = formantNames[!formantNames %in% names(vowels[[v]])]
    for (f in absentFormants) {
      closestFreq = unlist(sapply(vowels, function(x) x[f]))
      # names_stripped = substr(names(closestFreq),
      #                         nchar(names(closestFreq)) - 3,
      #                         nchar(names(closestFreq)))
      # closestFreq = closestFreq[which(names_stripped == 'freq')]
      vowels[[v]] [[f]] = as.numeric(na.omit(closestFreq))[1]
      # NB: instead of the last [1], ideally we should specify some intelligent
      # way to pick up the closest vowel with this missing formant, not just the
      # first one, but that's only a problem in long sequences of vowels with
      # really different numbers of formants (nasalization)
    }
  }

  # initialize a common list of exact formants
  formants = vector("list", length(formantNames))
  names(formants) = formantNames

  # for each vowel, append its formants to the common list
  for (v in 1:length(valid_phonemes)) {
    vowel = vowels[[valid_phonemes[v]]]
    for (f in 1:length(vowel)) {
      formantName = names(vowel)[f]
      formants[[formantName]] = c(formants[[formantName]], vowel[[f]])
    }
  }

  return(formants)
}


#' Lock to formants
#'
#' Internal soundgen function
#'
#' When f0 or another relatively strong harmonic is close to one of the
#' formants, the pitch contour is modified so as to "lock" it to this formant.
#' The relevant metric is energy gain (ratio of amplitudes before and after the
#' adjustment) penalized by the magnitude of the necessary pitch jump (in
#' semitones) and the amplitude of the locked harmonic relative to f0.
#' @param pitch pitch contour, numeric vector (normally pitch_per_gc)
#' @param specEnv spectral envelope as returned by getSpectralEnvelope
#' @param rolloffMatrix rolloff matrix as returned by getRolloff
#' @param formantSummary matrix of exact formant frequencies (formants in rows,
#'   time in columns)
#' @param lockProb the (approximate) proportion of sound affected by formant
#'   locking
#' @param minLength the minimum number of consecutive pitch values affected
#'   (shorter segments of formant locking are ignored)
#' @param plot if TRUE, plots the original and modified pitch contour
#' @keywords internal
#' @examples
#' n = 50
#' pitch = getSmoothContour(len = n, anchors = c(600, 2000, 1900, 400),
#'   thisIsPitch = TRUE, plot = TRUE)
#' rolloffMatrix = getRolloff(pitch_per_gc = pitch)
#' specEnv = getSpectralEnvelope(nr = 512, nc = length(pitch),
#'   formants = list(f1 = c(800, 1200), f2 = 2000, f3 = c(3500, 3200)),
#'   lipRad = 0, temperature = .00001, plot = TRUE)
#' formantSummary = t(data.frame(f1 = c(800, 1200), f2 = c(2000, 2000), f3 = c(3500, 3200)))
#' pitch2 = soundgen:::lockToFormants(pitch = pitch, specEnv = specEnv,
#'   rolloffMatrix = rolloffMatrix,
#'   formantSummary = formantSummary,
#'   lockProb = .5, minLength = 5, plot = TRUE)
#' pitch3 = soundgen:::lockToFormants(pitch = pitch, specEnv = specEnv,
#'   rolloffMatrix = rolloffMatrix,
#'   formantSummary = formantSummary,
#'   lockProb = list(time = c(0, .7, 1), value = c(0, 1, 0)),
#'   minLength = 5, plot = TRUE)
lockToFormants = function(pitch,
                          specEnv,
                          formantSummary,
                          rolloffMatrix = NULL,
                          lockProb = .1,
                          minLength = 3,
                          plot = FALSE) {
  if (!is.null(rolloffMatrix)) {
    nr = nrow(rolloffMatrix)
  } else {
    nr = 1
    rolloffMatrix = matrix(1)
  }
  n = length(pitch)
  if (is.list(lockProb)) {
    lockProb = getSmoothContour(
      anchors = lockProb,
      len = n,
      valueFloor = permittedValues['formantLocking', 'low'],
      valueCeiling = permittedValues['formantLocking', 'high']
    )
  }
  freqs = as.numeric(rownames(specEnv)) * 1000
  specEnv = interpolMatrix(
    specEnv,
    nr = nrow(specEnv),
    nc = n,
    interpol = 'approx'
  )
  # rownames(specEnv) = freqs
  formants = interpolMatrix(as.matrix(formantSummary), nc = n)

  # Calculate how much we can gain by locking a harmonic to a formant at each time point
  d = data.frame(pitch = pitch)
  d$pitch2 = d$pitch1 = d$pitch
  d[, c('idx_max_gain', 'gain', 'nearest_formant')] = NA
  for (i in 1:n){  # for each time point (normally glottal cycle in pitch_per_gc)
    if (FALSE) {
      plot(freqs, specEnv[, i], type = 'l')
      abline(v = pitch[i], lty = 3)
    }
    # formants = as.data.frame(seewave::fpeaks(cbind(freqs, specEnv[, i]),
    #                                          threshold = 1, plot = FALSE))

    # for each harmonic in rolloffMatrix
    temp = data.frame(harmonic = 1:nr, gain = NA, nearest_formant = NA)
    for (h in 1:nr) {  # for each harmonic
      nearest_formant = which.min(abs(formants[, i] - pitch[i] * h))
      nearest_formant_amp = specEnv[which.min(abs(formants[nearest_formant, i] - freqs)), i]
      idx_cur = which.min(abs(freqs - pitch[i] * h))
      amp_gain = nearest_formant_amp / specEnv[idx_cur, i]
      # amp_gain ~ how much E can be gained if locked to nearest formant
      dist_to_nearest_formant = max(1, 12 * abs(log2(pitch[i] * h) - log2(formants[nearest_formant, i])))
      # in semitones (anything closer than 1 semitone counts as 1 semitone to
      # standardize penalization, otherwise we divide by very small numbers and
      # thus inflate apparent gain)
      temp$gain[h] = amp_gain / dist_to_nearest_formant * rolloffMatrix[h, i]
      temp$nearest_formant[h] = formants[nearest_formant, i]
    }
    d$idx_max_gain[i] = which.max(temp$gain)
    d$gain[i] = temp$gain[d$idx_max_gain[i]]
    d$nearest_formant[i] = temp$nearest_formant[d$idx_max_gain[i]]
    # the prob of locking to the nearest formant is some function of the gain in E
    # penalized by dist_to_nearest_formant
  }

  # find the frames in which formant locking should be applied
  lockThreshold = as.numeric(quantile(d$gain, probs = 1 - lockProb))  # can be a vector
  lock_idx = which(d$gain > lockThreshold)
  d$pitch1[lock_idx] = d$nearest_formant[lock_idx] / d$idx_max_gain[lock_idx]

  # only preserve sufficiently long fragments of formant locking
  d$pitch2 = d$pitch1
  if (is.numeric(minLength) && minLength > 1) {
    d$modif = (d$pitch != d$pitch1)
    r = rle(d$modif)
    r = data.frame(len = r[[1]], val = r[[2]], idx = 1)
    if (r$len[1] < minLength) r$val[1] = FALSE
    for (i in 2:nrow(r)) {
      r$idx[i] = sum(r$len[1:(i - 1)]) + 1
      if (r$len[i] < minLength) {
        r$val[i] = FALSE
      }
    }
    r1 = r[r$val == TRUE, , drop = FALSE]
    d$modif1 = FALSE
    #modif1 = soundgen:::clumper(modif, minLength = 5)
    if (nrow(r1) >= 1) {
      for (j in 1:nrow(r1)) {
        d$modif1[r1$idx[j] : (r1$idx[j] + r1$len[j] - 1)] = TRUE
      }
    }
    # + exclude rapid changes b/w the harmonic to which to lock? Tricky b/c we need to access different formant frequencies again, or find some intelligent way of removing pitch jumps

    # un-modify short fragments, reverting to the original pitch
    no_modif = which(d$modif1 == FALSE)
    d$pitch2 = d$pitch1
    d$pitch2[no_modif] = d$pitch[no_modif]
  }


  # # median smoothing to remove weird pitch jumps
  # d$pitch2 = soundgen:::medianSmoother(data.frame(p = d$pitch1),
  #                                             smoothing_ww = 30,
  #                                             smoothingThres = 1)

  if (plot) {
    y_min = min(d[, c('pitch', 'pitch1', 'pitch2')]) / 1.1
    y_max = max(d[, c('pitch', 'pitch1', 'pitch2')]) * 1.1
    plot(d$pitch, type = 'l', lty = 2, col = 'red', ylim = c(y_min, y_max))
    points(d$pitch1, type = 'l', lty =3, col = 'blue')
    points(d$pitch2, type = 'l')
    # for (f in 1:length(formants)) {
    #   abline(h = formants[[f]]$freq, lty = 3)
    # }
  }

  invisible(d$pitch2)
}


#' Get formants
#'
#' Internal soundgen function
#'
#' A barebones version of analyze() that only measures formants. Called by
#' formant_app()
#' @param audio input sound as returned by readAudio
#' @inheritParams analyze
#' @keywords internal
#' @examples
#' data(sheep, package = 'seewave')
#' f = soundgen:::getFormants(soundgen:::readAudio(sheep, from = .1, to = .5))
#' f[11:15, 1:5]
getFormants = function(
    audio,
    windowLength = 50,
    step = 25,
    wn = 'gaussian',
    zp = 0,
    dynamicRange = 80,
    silence = 0.04,
    formants = list(),
    nFormants = NULL) {
  windowLength_points = audio$samplingRate * (windowLength / 1000)
  frameBank = getFrameBank(
    sound = audio$sound,
    samplingRate = audio$samplingRate,
    windowLength_points = windowLength_points,
    wn = wn,
    step = step,
    zp = zp,
    normalize = TRUE,
    filter = NULL,
    padWithSilence = FALSE,
    timeShift = audio$timeShift
  )
  timestamps = as.numeric(colnames(frameBank))
  myseq = (timestamps - audio$timeShift * 1000 - windowLength / 2) *
    audio$samplingRate / 1000 + 1
  myseq[1] = 1  # just in case of rounding errors
  l = length(myseq)
  myseq[l] = min(myseq[l], audio$ls - windowLength_points)

  # which frames are silent
  m = max(abs(audio$sound))
  m_to_scale = m / audio$scale
  silence = silence * m / m_to_scale
  ampl = apply(as.matrix(1:length(myseq)), 1, function(x) {
    # perceived intensity - root mean square of amplitude
    # (NB: m / scale corrects the scale back to original, otherwise sound is [-1, 1])
    sqrt(mean((audio$sound[myseq[x]:(myseq[x] + windowLength_points - 1)]) ^ 2, na.rm = TRUE))
  })
  cond_silence = ampl >= silence
  framesToAnalyze = which(cond_silence)
  nf = length(framesToAnalyze)

  ## FORMANTS
  fmts = NULL
  no_formants = FALSE
  if (is.null(nFormants)) nFormants = 100
  # try one frame to see how many formants are returned
  fmts_list = vector('list', length = nf)
  if (nFormants > 0 & nf > 0) {
    # we don't really know how many formants will be returned by phonTools, so
    # we save everything at first, and then trim to nFormants
    for (i in 1:nf) {
      fmts_list[[i]] = try(suppressWarnings(do.call(
        phonTools::findformants,
        c(list(frameBank[, framesToAnalyze[i]],
               fs = audio$samplingRate, verify = FALSE),
          formants))),
        silent = TRUE)
      if (inherits(fmts_list[[i]], 'try-error')) {
        fmts_list[[i]] = data.frame(formant = NA, bandwidth = NA)[-1, ]
      }
    }
    # check how many formants we will/can save
    nFormants_avail = min(nFormants, max(unlist(lapply(fmts_list, nrow))))
    if (nFormants_avail > 0) {
      nFormants = nFormants_avail
      availableRows = 1:nFormants
      fmts = matrix(NA, nrow = ncol(frameBank), ncol = nFormants * 2)
      colnames(fmts) = paste0('f', rep(availableRows, each = 2),
                              rep(c('_freq', '_width'), nFormants))
      # iterate through the full formant list and save what's needed
      for (i in 1:nf) {
        ff = fmts_list[[i]]
        if (is.list(ff)) {
          nr = nrow(ff)
          if (nr < nFormants) {
            ff[(nr + 1):nFormants, ] = NA
          }
          temp = matrix(NA, nrow = nFormants, ncol = 2)
          temp[availableRows, ] = as.matrix(ff[availableRows, ])
          fmts[framesToAnalyze[i], ] = matrix(t(temp), nrow = 1)
        }
      }
    } else {
      no_formants = TRUE
    }
  } else if (nFormants > 0 && nf == 0) {
    no_formants = TRUE
  }
  if (no_formants) {
    # no formant analysis
    availableRows = 1:nFormants
    fmts = matrix(NA, nrow = ncol(frameBank), ncol = nFormants * 2)
    colnames(fmts) = paste0('f', rep(availableRows, each = 2),
                            rep(c('_freq', '_width'), nFormants))
  }
  out = cbind(
    data.frame(time = timestamps),
    as.data.frame(fmts)
  )
  return(out)
}

Try the soundgen package in your browser

Any scripts or data that you put into this service are public.

soundgen documentation built on Sept. 29, 2023, 5:09 p.m.