# build pdf manual (terminal): R CMD Rd2pdf "/home/allgoodguys/Documents/Studying/Lund_PhD/methods/sound-synthesis/soundgen"
# build vignettes: devtools::build_vignettes()
# devtools::build_win() # to check on windows
# To install from github (in RStudio):
# install.packages("devtools", "tuneR", "seewave", "phonTools", "zoo", "shiny", "shinyBS", "reshape2", "mvtnorm", "plyr", "dtw", "grid")
# library(devtools)
# install_github("tatters/soundgen")
# view vignettes:
# vignette(package = 'soundgen')
# vignette('sound_generation', package = 'soundgen')
# vignette('acoustic_analysis', package = 'soundgen')
# or, to open in a browser
# RShowDoc('sound_generation', package = 'soundgen')
# RShowDoc('acoustic_analysis', package = 'soundgen')
#' @import stats graphics utils grDevices
NULL
#' Generate a sound
#'
#' Generates a bout of one or more syllables with pauses between them. Two basic
#' components are synthesized: the harmonic component (the sum of sine waves
#' with frequencies that are multiples of the fundamental frequency) and the
#' noise component. Both components can be filtered with independently specified
#' formants. Intonation and amplitude contours can be applied both within each
#' syllable and across multiple syllables. Suggested application: synthesis of
#' animal or human non-linguistic vocalizations. For more information, see
#' \url{http://cogsci.se/soundgen.html} and the vignette on sound generation.
#' @param repeatBout the number of times the whole bout should be repeated
#' @param nSyl the number of syllables in the bout. Intonation, amplitude, and
#' formants contours span multiple syllables, but not multiple bouts (see
#' Details)
#' @param sylLen average duration of each syllable, ms
#' @param pauseLen average duration of pauses between syllables, ms
#' @param pitchAnchors a numeric vector of f0 values in Hz (assuming equal time
#' steps) or a dataframe specifying the time (ms) and value (Hz) of each
#' anchor. These anchors are used to create a smooth contour of fundamental
#' frequency f0 (pitch) within one syllable (see Examples)
#' @param pitchAnchorsGlobal unlike \code{pitchAnchors}, these anchors are used
#' to create a smooth contour of average f0 across multiple syllables. The
#' values are in semitones relative to the existing pitch, i.e. 0 = no change
#' @param temperature hyperparameter for regulating the amount of stochasticity
#' in sound generation
#' @param tempEffects a list of scale factors regulating the effect of
#' temperature on particular parameters. To change, specify just those pars
#' that you want to modify, don't rewrite the whole list (defaults are
#' hard-coded). \code{sylLenDep}: random variation of the duration of
#' syllables and pauses; \code{formDrift}: the amount of random drift of
#' formants; \code{formDisp}: irregularity of the dispersion of stochastic
#' formants; \code{pitchDriftDep}: amount of slow random drift of f0;
#' \code{pitchDriftFreq}: frequency of slow random drift of f0;
#' \code{pitchAnchorsDep, noiseAnchorsDep, amplAnchorsDep}: random
#' fluctuations of user-specified pitch / noise / amplitude anchors
#' @param maleFemale hyperparameter for shifting f0 contour, formants, and
#' vocalTract to make the speaker appear more male (-1...0) or more female
#' (0...+1)
#' @param creakyBreathy hyperparameter for a rough adjustment of voice quality
#' from creaky (-1) to breathy (+1)
#' @param nonlinBalance hyperparameter for regulating the (approximate)
#' proportion of sound with different regimes of pitch effects (none /
#' subharmonics only / subharmonics and jitter). 0\% = no noise; 100\% = the
#' entire sound has jitter + subharmonics. Ignored if temperature = 0
#' @param nonlinDep hyperparameter for regulating the intensity of
#' subharmonics and jitter, 0 to 100\% (50\% = jitter and subharmonics are as
#' specified, <50\% weaker, >50\% stronger). Ignored if temperature = 0
#' @param jitterLen duration of stable periods between pitch jumps, ms. Use a
#' low value for harsh noise, a high value for irregular vibrato or shaky
#' voice
#' @param jitterDep cycle-to-cycle random pitch variation, semitones
#' @param vibratoFreq the rate of regular pitch modulation, or vibrato, Hz
#' @param vibratoDep the depth of vibrato, semitones
#' @param shimmerDep random variation in amplitude between individual glottal
#' cycles (0 to 100\% of original amplitude of each cycle)
#' @param attackLen duration of fade-in / fade-out at each end of syllables and
#' noise (ms)
#' @param rolloff basic rolloff at a constant rate of \code{rolloff} db/octave
#' (exponential decay). See \code{\link{getRolloff}} for more details
#' @param rolloffOct basic rolloff changes from lower to upper
#' harmonics (regardless of f0) by \code{rolloffOct} dB/oct. For
#' example, we can get steeper rolloff in the upper part of the spectrum
#' @param rolloffParab an optional quadratic term affecting only the
#' first \code{rolloffParabHarm} harmonics. The middle harmonic
#' of the first \code{rolloffParabHarm} harmonics is amplified or
#' dampened by \code{rolloffParab} dB relative to the basic
#' exponential decay.
#' @param rolloffParabHarm the number of harmonics affected by
#' \code{rolloffParab}
#' @param rolloffKHz rolloff changes linearly with f0 by
#' \code{rolloffKHz} dB/kHz. For ex., -6 dB/kHz gives a 6 dB
#' steeper basic rolloff as f0 goes up by 1000 Hz
#' @param rolloffLip the effect of lip radiation on source spectrum, dB/oct
#' (the default of +6 dB/oct produces a high-frequency boost when the mouth is
#' open)
#' @param formants either a character string like "aaui" referring to default
#' presets for speaker "M1" or a list of formant times, frequencies,
#' amplitudes, and bandwidths (see ex. below). \code{formants = NA} defaults
#' to schwa. Time stamps for formants and mouthOpening can be specified in ms
#' or an any other arbitarary scale. See \code{\link{getSpectralEnvelope}} for more details
#' @param formantDep scale factor of formant amplitude (1 = no change relative
#' to amplitudes in \code{formants})
#' @param formantDepStoch the amplitude of additional stochastic formants added above
#' the highest specified formant, dB (only if temperature > 0)
#' @param vocalTract the length of vocal tract, cm. Used for calculating formant
#' dispersion (for adding extra formants) and formant transitions as the mouth
#' opens and closes
#' @param subFreq target frequency of subharmonics, Hz (lower than f0, adjusted
#' dynamically so f0 is always a multiple of subFreq)
#' @param subDep the width of subharmonic band, Hz. Regulates how quickly the
#' strength of subharmonics fades as they move away from harmonics in f0
#' stack. Low values produce narrow sidebands, high values produce uniformly
#' strong subharmonics
#' @param shortestEpoch minimum duration of each epoch with unchanging
#' subharmonics regime, in ms
#' @param amDep amplitude modulation depth, %. 0: no change; 100: amplitude
#' modulation with amplitude range equal to the dynamic range of the sound
#' @param amFreq amplitude modulation frequency, Hz
#' @param amShape amplitude modulation shape (-1 to +1, defaults to 0)
#' @param noiseAnchors a numeric vector of noise amplitudes (-120 dB = none, 0
#' dB = as loud as voiced component) or a dataframe specifying the time (ms)
#' and amplitude (dB) of anchors for generating the noise component such as
#' aspiration, hissing, etc
#' @param formantsNoise the same as \code{formants}, but for the
#' noise component instead of the harmonic component. If NA (default), the
#' noise component will be filtered through the same formants as the harmonic
#' component, approximating aspiration noise [h]
#' @param rolloffNoise rolloff of noise, dB/octave. It is analogous to
#' \code{rolloff}, but while \code{rolloff} applies to the harmonic component,
#' \code{rolloffNoise} applies to the noise component
#' @param mouthAnchors a numeric vector of mouth opening (0 to 1, 0.5 = neutral,
#' i.e. no modification) or a dataframe specifying the time (ms) and value of
#' mouth opening
#' @param amplAnchors a numeric vector of amplitude envelope (0 to 1) or a
#' dataframe specifying the time (ms) and value of amplitude anchors
#' @param amplAnchorsGlobal a numeric vector of global amplitude envelope
#' spanning multiple syllables or a dataframe specifying the time (ms) and
#' value (0 to 1) of each anchor
#' @param samplingRate sampling frequency, Hz
#' @param windowLength length of FFT window, ms
#' @param overlap FFT window overlap, \%
#' @param addSilence silence before and after the bout, ms
#' @param pitchFloor,pitchCeiling lower & upper bounds of f0
#' @param pitchSamplingRate sampling frequency of the pitch contour only, Hz. Low
#' values reduce processing time. A rule of thumb is to set this to
#' the same value as \code{pitchCeiling}
#' @param throwaway discard harmonics and noise that are quieter than this
#' number (in dB, defaults to -120) to save computational resources
#' @param invalidArgAction what to do if an argument is invalid or outside the
#' range in \code{permittedValues}: 'adjust' = reset to default value, 'abort'
#' = stop execution, 'ignore' = throw a warning and continue (may crash)
#' @param plot if TRUE, plots a spectrogram
#' @param play if TRUE, plays the synthesized sound. In case of errors, try
#' setting another default player for \code{\link[tuneR]{play}}
#' @param savePath full path for saving the output, e.g. '~/Downloads/temp.wav'.
#' If NA (default), doesn't save anything
#' @param ... other plotting parameters passed to \code{\link{spectrogram}}
#' @export
#' @return Returns the synthesized waveform as a numeric vector.
#' @examples
#' # NB: GUI for soundgen is available as a Shiny app.
#' # Type "soundgen_app()" to start it
#'
#' playback = c(TRUE, FALSE)[2] # set to TRUE to play back the audio from examples
#'
#' sound = soundgen(play = playback)
#' # spectrogram(sound, 16000, osc = TRUE)
#' # playme(sound)
#'
#' # Use the in-built collection of presets:
#' # names(presets) # speakers
#' # names(presets$Chimpanzee) # calls per speaker
#' s1 = eval(parse(text = presets$Chimpanzee$Scream_conflict)) # screaming chimp
#' # playme(s1)
#' s2 = eval(parse(text = presets$F1$Scream_conflict))
#' # playme(s2)
#' # unless temperature is 0, the sound is different every time
#' for (i in 1:3) sound = soundgen(play = playback, temperature = .2)
#'
#' # Bouts versus syllables. Compare:
#' sound = soundgen(formants = 'uai', repeatBout = 3, play = playback)
#' sound = soundgen(formants = 'uai', nSyl = 3, play = playback)
#'
#' # Intonation contours per syllable and globally:
#' sound = soundgen(nSyl = 5, sylLen = 200, pauseLen = 140,
#' play = playback, pitchAnchors = data.frame(
#' time = c(0, 0.65, 1), value = c(977, 1540, 826)),
#' pitchAnchorsGlobal = data.frame(time = c(0, .5, 1), value = c(-6, 7, 0)))
#'
#' # Subharmonics in sidebands (noisy scream)
#' sound = soundgen (nonlinBalance = 100, subFreq = 75, subDep = 130,
#' pitchAnchors = data.frame(
#' time = c(0, .3, .9, 1), value = c(1200, 1547, 1487, 1154)),
#' sylLen = 800,
#' play = playback, plot = TRUE)
#'
#' # Jitter and mouth opening (bark, dog-like)
#' sound = soundgen(repeatBout = 2, sylLen = 160, pauseLen = 100,
#' nonlinBalance = 100, subFreq = 100, subDep = 60, jitterDep = 1,
#' pitchAnchors = data.frame(time = c(0, 0.52, 1), value = c(559, 785, 557)),
#' mouthAnchors = data.frame(time = c(0, 0.5, 1), value = c(0, 0.5, 0)),
#' vocalTract = 5, play = playback)
soundgen = function(repeatBout = 1,
nSyl = 1,
sylLen = 300,
pauseLen = 200,
pitchAnchors = data.frame(time = c(0, .1, .9, 1),
value = c(100, 150, 135, 100)),
pitchAnchorsGlobal = NA,
temperature = 0.025,
tempEffects = list(
sylLenDep = .02,
formDrift = .3,
formDisp = .2,
pitchDriftDep = .5,
pitchDriftFreq = .125,
pitchAnchorsDep = .05,
noiseAnchorsDep = .1,
amplAnchorsDep = .1
),
maleFemale = 0,
creakyBreathy = 0,
nonlinBalance = 0,
nonlinDep = 50,
jitterLen = 1,
jitterDep = 3,
vibratoFreq = 5,
vibratoDep = 0,
shimmerDep = 0,
attackLen = 50,
rolloff = -12,
rolloffOct = -12,
rolloffKHz = -6,
rolloffParab = 0,
rolloffParabHarm = 3,
rolloffLip = 6,
formants = list(f1 = list(time = 0, freq = 860,
amp = 30, width = 120),
f2 = list(time = 0, freq = 1280,
amp = 40, width = 120),
f3 = list(time = 0, freq = 2900,
amp = 25, width = 200)),
formantDep = 1,
formantDepStoch = 30,
vocalTract = 15.5,
subFreq = 100,
subDep = 100,
shortestEpoch = 300,
amDep = 0,
amFreq = 30,
amShape = 0,
noiseAnchors = data.frame(time = c(0, 300),
value = c(-120, -120)),
formantsNoise = NA,
rolloffNoise = -14,
mouthAnchors = data.frame(time = c(0, 1),
value = c(.5, .5)),
amplAnchors = NA,
amplAnchorsGlobal = NA,
samplingRate = 16000,
windowLength = 50,
overlap = 75,
addSilence = 100,
pitchFloor = 50,
pitchCeiling = 3500,
pitchSamplingRate = 3500,
throwaway = -120,
invalidArgAction = c('adjust', 'abort', 'ignore')[1],
plot = FALSE,
play = FALSE,
savePath = NA,
...) {
# check that values of numeric arguments are valid and within range
for (p in rownames(permittedValues)[1:which(
rownames(permittedValues) == 'rolloffNoise'
)]) {
if (!is.numeric(get(p)) ||
get(p) < permittedValues[p, 'low'] ||
get(p) > permittedValues[p, 'high']) {
if (invalidArgAction == 'abort') {
# exit with a warning
stop(paste(p, 'must be between',
permittedValues[p, 'low'],
'and',
permittedValues[p, 'high']))
} else if (invalidArgAction == 'ignore') {
# throw a warning and continue
warning(paste(p, "outside its range in 'permittedValues'"))
} else {
# reset p to default, with a warning
assign(noquote(p), permittedValues[p, 'default'])
warning(paste0(p, " outside permitted range, reset to ", get(p),
". Edit 'permittedValues' or use ",
"invalidArgAction = 'ignore' to force."))
}
}
}
# convert numeric anchors to dataframes
for (anchor in c('pitchAnchors', 'pitchAnchorsGlobal',
'amplAnchors', 'amplAnchorsGlobal', 'mouthAnchors')) {
if (is.numeric(get(anchor)) && length(get(anchor)) > 0) {
assign(anchor, data.frame(time = seq(0, 1, length.out = length(get(anchor))),
value = get(anchor)))
}
}
if (is.numeric(noiseAnchors) && length(noiseAnchors) > 0) {
noiseAnchors = data.frame(time = seq(0, sylLen, length.out = length(noiseAnchors)),
value = noiseAnchors)
}
windowLength_points = floor(windowLength / 1000 * samplingRate / 2) * 2
# defaults of tempEffects
if (!is.numeric(tempEffects$formDrift)) tempEffects$formDrift = .3
if (!is.numeric(tempEffects$formDisp)) tempEffects$formDisp = .2
if (!is.numeric(tempEffects$pitchDriftDep)) tempEffects$pitchDriftDep = .5
if (!is.numeric(tempEffects$pitchDriftFreq)) tempEffects$pitchDriftFreq = .125
if (!is.numeric(tempEffects$pitchAnchorsDep)) tempEffects$pitchAnchorsDep = .05
if (!is.numeric(tempEffects$noiseAnchorsDep)) tempEffects$noiseAnchorsDep = .1
if (!is.numeric(tempEffects$amplAnchorsDep)) tempEffects$amplAnchorsDep = .1
# force anchor lists to dataframe
if (class(pitchAnchors) == 'list') pitchAnchors = as.data.frame(pitchAnchors)
if (class(pitchAnchorsGlobal) == 'list') pitchAnchorsGlobal = as.data.frame(pitchAnchorsGlobal)
if (class(amplAnchors) == 'list') amplAnchors = as.data.frame(amplAnchors)
if (class(amplAnchorsGlobal) == 'list') amplAnchorsGlobal = as.data.frame(amplAnchorsGlobal)
if (class(mouthAnchors) == 'list') mouthAnchors = as.data.frame(mouthAnchors)
if (class(noiseAnchors) == 'list') noiseAnchors = as.data.frame(noiseAnchors)
# adjust parameters according to the specified hyperparameters
if (creakyBreathy < 0) {
# for creaky voice
nonlinBalance = min(100, nonlinBalance - creakyBreathy * 50)
jitterDep = max(0, jitterDep - creakyBreathy / 2)
shimmerDep = max(0, shimmerDep - creakyBreathy * 5)
subDep = subDep * 2 ^ (-creakyBreathy)
} else if (creakyBreathy > 0) {
# for breathy voice, add breathing
noiseAnchors = data.frame(time = c(0, sylLen + 100),
value = c(-120, -120))
noiseAnchors$value = noiseAnchors$value + creakyBreathy * 160
noiseAnchors$value[noiseAnchors$value >
permittedValues['noiseAmpl', 'high']] =
permittedValues['noiseAmpl', 'high']
}
# adjust rolloff for both creaky and breathy voices
rolloff = rolloff - creakyBreathy * 10
rolloffOct = rolloffOct - creakyBreathy * 5
subFreq = 2 * (subFreq - 50) / (1 + exp(-.1 * (50 - nonlinDep))) + 50
# subFreq unchanged for nonlinDep=50%, raised for lower and
# lowered for higher noise intensities. Max set at 2*subFreq-50, min at 50 Hz.
# Illustration: subFreq=250; nonlinDep=0:100; plot(nonlinDep,
# 2 * (subFreq - 50) / (1 + exp(-.1 * (50 - nonlinDep))) + 50, type = 'l')
jitterDep = 2 * jitterDep / (1 + exp(.1 * (50 - nonlinDep)))
# Illustration: jitterDep = 1.5; nonlinDep = 0:100;
# plot(nonlinDep, 2 * jitterDep / (1 + exp(.1 * (50 - nonlinDep))),
# type = 'l')
if (maleFemale != 0) {
# adjust pitch and formants along the male-female dimension
# pitch varies by 1 octave up or down
if (is.list(pitchAnchors)) {
pitchAnchors$value = pitchAnchors$value * 2 ^ maleFemale
}
if (is.list(formants)) {
for (f in 1:length(formants)) {
# formants vary by 25% up or down:
# see http://www.santiagobarreda.com/vignettes/v1/v1.html)
formants[[f]]$freq = formants[[f]]$freq * 1.25 ^ maleFemale
}
}
# vocalTract varies by ±25% from the average
vocalTract = vocalTract * (1 - .25 * maleFemale)
}
# soundgen() normally expects a list of formant values,
# but a string is also ok for demonstration purposes
# (dictionary for caller 1 is used to interpret)
if (class(formants) == 'character') {
formants = convertStringToFormants(formants)
}
if (class(formants[[1]]) == 'list') {
formants = lapply(formants, as.data.frame)
}
# stochastic rounding of the number of syllables and repeatBouts
# (eg for nSyl = 2.5, we'll have 2 or 3 syllables with equal probs)
# NB: this is very useful for morphing
if (!is.integer(nSyl)) {
nSyl = floor(nSyl) + rbinom(1, 1, nSyl - floor(nSyl))
}
if (!is.integer(repeatBout)) {
repeatBout = floor(repeatBout) +
rbinom(1, 1, repeatBout - floor(repeatBout))
}
# prepare a list of pars for calling generateHarmonics()
pars_to_vary = c(
'nonlinDep',
'attackLen',
'jitterDep',
'shimmerDep',
'rolloff',
'rolloffOct',
'shortestEpoch',
'subFreq',
'subDep'
)
# don't add nonlinBalance, otherwise there is no simple way to remove noise at temp>0
pars_to_round = c('attackLen', 'subFreq', 'subDep')
pars_list = list(
'attackLen' = attackLen,
'jitterDep' = jitterDep,
'jitterLen' = jitterLen,
'vibratoFreq' = vibratoFreq,
'vibratoDep' = vibratoDep,
'shimmerDep' = shimmerDep,
'creakyBreathy' = creakyBreathy,
'rolloff' = rolloff,
'rolloffOct' = rolloffOct,
'rolloffKHz' = rolloffKHz,
'rolloffParab' = rolloffParab,
'rolloffParabHarm' = rolloffParabHarm,
'temperature' = temperature,
'pitchDriftDep' = tempEffects$pitchDriftDep,
'pitchDriftFreq' = tempEffects$pitchDriftFreq,
'shortestEpoch' = shortestEpoch,
'subFreq' = subFreq,
'subDep' = subDep,
'rolloffLip' = rolloffLip,
'amDep' = amDep,
'amFreq' = amFreq,
'nonlinBalance' = nonlinBalance,
'nonlinDep' = nonlinDep,
'pitchFloor' = pitchFloor,
'pitchCeiling' = pitchCeiling,
'pitchSamplingRate' = pitchSamplingRate,
'throwaway' = throwaway,
'samplingRate' = samplingRate,
'overlap' = overlap
)
pars_syllable = pars_list
if (is.list(pitchAnchorsGlobal) &&
any(!is.na(pitchAnchorsGlobal)) &&
any(pitchAnchorsGlobal$value != 0) &&
nSyl > 1) {
pitchDeltas = 2 ^ (
getDiscreteContour(
len = nSyl,
anchors = pitchAnchorsGlobal,
method = 'spline',
plot = FALSE
) / 12
)
} else {
pitchDeltas = rep(1, nSyl)
}
# make sure pitchAnchors$time range from 0 to 1
if (is.list(pitchAnchors)) {
if (min(pitchAnchors$time) < 0) {
pitchAnchors$time = pitchAnchors$time - min(pitchAnchors$time)
}
if (max(pitchAnchors$time) > 1) {
pitchAnchors$time = pitchAnchors$time / max(pitchAnchors$time)
}
}
wiggleNoise = temperature > 0 &&
class(noiseAnchors) == 'data.frame' &&
sum(noiseAnchors$value > throwaway) > 0
wiggleAmpl_per_syl = temperature > 0 &&
!is.na(amplAnchors) &&
sum(amplAnchors$value < -throwaway) > 0
# START OF BOUT GENERATION
for (b in 1:repeatBout) {
# syllable segmentation
if (sylLen >= permittedValues['sylLen', 'low'] && sylLen <= permittedValues['sylLen', 'high']) {
sylDur_s = rnorm_bounded(
n = 1,
mean = sylLen,
low = permittedValues['sylLen', 'low'],
high = permittedValues['sylLen', 'high'],
sd = (permittedValues['sylLen', 'high'] -
permittedValues['sylLen', 'low']) * temperature * tempEffects$sylLenDep,
roundToInteger = FALSE
)
} else {
sylDur_s = sylLen # don't try to wiggle weird values, esp. 0
}
pauseDur_s = rnorm_bounded(
n = 1,
mean = pauseLen,
low = permittedValues['pauseLen', 'low'],
high = permittedValues['pauseLen', 'high'],
sd = (permittedValues['pauseLen', 'high'] -
permittedValues['pauseLen', 'low']) * temperature * tempEffects$sylLenDep,
roundToInteger = FALSE
)
syllables = divideIntoSyllables(
sylLen = sylDur_s,
nSyl = nSyl,
pauseLen = pauseDur_s,
sylDur_min = permittedValues['sylLen', 'low'],
sylDur_max = permittedValues['sylLen', 'high'],
pauseDur_min = permittedValues['pauseLen', 'low'],
pauseDur_max = permittedValues['pauseLen', 'high'],
temperature = temperature * tempEffects$sylLenDep
)
syllableStartIdx = round(syllables[, 'start'] * samplingRate / 1000)
syllableStartIdx[1] = 1
# if noise is added before the voiced part of each syllable
# (negative time anchors) or starts later than the voiced part,
# we need to shift noise insertion points
if (!is.na(noiseAnchors) && noiseAnchors$time[1] != 0) {
shift = -round(noiseAnchors$time[1] * samplingRate / 1000)
if (noiseAnchors$time[1] < 0) {
# only the first syllableStartIdx is shifted, because that changes
# the sound length and the remaining syllableStartIdx, if any,
# are already shifted appropriately
syllableStartIdx[1] = syllableStartIdx - shift
} else {
syllableStartIdx = syllableStartIdx - shift # shift for each syllable
}
}
# end of syllable segmentation
# START OF SYLLABLE GENERATION
voiced = vector()
unvoiced = list()
noiseAnchors_syl = list()
for (s in 1:nrow(syllables)) {
# wiggle par values for this particular syllable, making sure
# they are within the permitted range for each variable
pitchAnchors_per_syl = pitchAnchors
amplAnchors_per_syl = amplAnchors
if (temperature > 0) {
# OR if (temperature>0 & nrow(syllables)>1)
# if you don't want to mess with single-syllable vocalizations
for (p in 1:length(pars_to_vary)) {
l = permittedValues[pars_to_vary[p], 'low']
h = permittedValues[pars_to_vary[p], 'high']
pars_syllable[pars_to_vary[p]] = rnorm_bounded(
n = 1,
mean = as.numeric(pars_list[pars_to_vary[p]]),
low = l,
high = h,
sd = (h - l) * temperature / 10,
roundToInteger = (pars_to_vary[p] %in% pars_to_round)
)
# /10 to have less variation in the spectral pars vs.
# duration of separate syllables
}
if (is.list(pitchAnchors_per_syl)) {
pitchAnchors_per_syl = wiggleAnchors(
df = pitchAnchors_per_syl,
temperature = temperature,
low = c(0, permittedValues['pitch', 'low']),
high = c(1, permittedValues['pitch', 'high']),
temp_coef = tempEffects$pitchAnchorsDep
)
}
if (wiggleNoise) {
noiseAnchors_syl[[s]] = wiggleAnchors(
df = noiseAnchors,
temperature = temperature,
low = c(-Inf, permittedValues['noiseAmpl', 'low']),
high = c(+Inf, permittedValues['noiseAmpl', 'high']),
wiggleAllRows = TRUE,
temp_coef = tempEffects$noiseAnchorsDep
)
}
if (wiggleAmpl_per_syl) {
amplAnchors_per_syl = wiggleAnchors(
df = amplAnchors_per_syl,
temperature = temperature,
low = c(0, 0),
high = c(1,-throwaway),
temp_coef = tempEffects$amplAnchorsDep
)
}
}
# generate smooth pitch contour for this particular syllable
dur_syl = as.numeric(syllables[s, 'end'] - syllables[s, 'start'])
if (is.list(pitchAnchors_per_syl) | is.numeric(pitchAnchors_per_syl)) {
pitchContour_syl = getSmoothContour(
anchors = pitchAnchors_per_syl,
len = round(dur_syl * pitchSamplingRate / 1000),
samplingRate = pitchSamplingRate,
valueFloor = pitchFloor,
valueCeiling = pitchCeiling,
thisIsPitch = TRUE
) * pitchDeltas[s]
# plot(pitchContour_syl, type = 'l')
}
# generate the voiced part
if (dur_syl < permittedValues['sylLen', 'low'] |
(!is.na(noiseAnchors) && min(noiseAnchors$value) >= 40) |
!is.list(pitchAnchors_per_syl)) {
# only synthesize voiced part if noise is weaker than 40 dB
# and the voiced part is long enough to bother synthesizing it
syllable = rep(0, round(dur_syl * samplingRate / 1000))
} else {
# the actual synthesis is here
syllable = try(do.call(generateHarmonics, c(
pars_syllable,
list(pitch = pitchContour_syl, amplAnchors = amplAnchors_per_syl)
))
)
}
# spectrogram(syllable, samplingRate = samplingRate)
# playme(syllable, samplingRate = samplingRate)
if (class(syllable) == 'try-error') {
stop('Failed to generate the new syllable!')
}
if (sum(is.na(syllable)) > 0) {
stop('The new syllable contains NA values!')
}
# generate pause for all but the last syllable
if (s < nrow(syllables)) {
pause = rep(0, floor((syllables[s + 1, 1] - syllables[s, 2]) *
samplingRate / 1000))
} else {
pause = numeric()
}
# add syllable and pause to the growing sound
voiced = c(voiced, syllable, pause)
# generate the unvoiced part, but don't add it to the sound just yet
if (!is.na(noiseAnchors) &&
sum(noiseAnchors$value > throwaway) > 0) {
# adjust noiseAnchors$time to match the actual syllable duration
noiseAnchors_syl[[s]] = noiseAnchors
noiseAnchors_syl[[s]]$time[noiseAnchors_syl[[s]]$time > 0] =
noiseAnchors_syl[[s]]$time[noiseAnchors_syl[[s]]$time > 0] *
dur_syl / sylLen
# negative time anchors are not changed: the pre-aspiration length
# is constant, regardless of the actual syllable duration.
# However, positive time anchors are proportional to the actual
# syllable duration re the average expected duration (which the user
# sees in the UI when choosing time anchors)
unvoicedDur_syl = round(diff(range(noiseAnchors_syl[[s]]$time)) *
samplingRate / 1000)
# calculate noise spectrum
if (is.null(formantsNoise) || is.na(formantsNoise[1])) {
spectralEnvelopeNoise = NA
} else {
movingFormants = max(unlist(lapply(formantsNoise, length))) > 1 |
sum(mouthAnchors$value != .5) > 0 # are noise formants moving, as opposed to constant?
nInt = ifelse(movingFormants,
round(diff(range(noiseAnchors_syl[[s]]$time)) / 10),
1) # the number of different noise spectra,
# allowing one column (noise spectrum) per 10 ms of audio
spectralEnvelopeNoise = getSpectralEnvelope(
nr = windowLength_points / 2,
nc = nInt,
formants = formantsNoise,
formantDep = formantDep,
formantDepStoch = formantDepStoch,
rolloffLip = rolloffLip,
mouthAnchors = mouthAnchors,
temperature = temperature,
formDrift = tempEffects$formDrift,
formDisp = tempEffects$formDisp,
samplingRate = samplingRate,
vocalTract = vocalTract
)
# image(t(spectralEnvelopeNoise))
}
# synthesize the unvoiced part
unvoiced[[s]] = generateNoise(
len = unvoicedDur_syl,
noiseAnchors = noiseAnchors_syl[[s]],
rolloffNoise = rolloffNoise,
attackLen = attackLen,
samplingRate = samplingRate,
windowLength_points = windowLength_points,
overlap = overlap,
throwaway = throwaway,
filterNoise = spectralEnvelopeNoise
)
# plot(unvoiced[[s]], type = 'l')
}
}
# plot(voiced, type = 'l')
# spectrogram(voiced, samplingRate = samplingRate, osc = TRUE)
# playme(voiced, samplingRate = samplingRate)
# END OF SYLLABLE GENERATION
# if the unvoiced noise is of type "breathing" (the same formants as in
# the voiced part), we mix voiced+unvoiced BEFORE filtering the sound,
# otherwise we filter first and then mix voiced+unvoiced
sound = voiced
if (length(unvoiced) > 0 && (is.null(formantsNoise) || is.na(formantsNoise))) {
for (s in 1:length(unvoiced)) {
sound = addVectors(sound, unvoiced[[s]],
insertionPoint = syllableStartIdx[s])
}
}
# plot(sound, type = 'l')
# spectrogram(sound, samplingRate = samplingRate)
# playme(sound, samplingRate = samplingRate)
# for polysyllabic vocalizations, apply amplitude envelope (if specified)
# over the entire bout and normalize to -1...+1
if (!is.na(amplAnchorsGlobal) &&
length(which(amplAnchorsGlobal$value < -throwaway)) > 0) {
# convert from dB to linear multiplier
amplAnchorsGlobal$value = 2 ^ (amplAnchorsGlobal$value / 10)
amplEnvelope = getSmoothContour(
anchors = amplAnchorsGlobal,
len = length(sound),
valueFloor = 0,
valueCeiling = -throwaway,
samplingRate = samplingRate
) # plot(amplEnvelope)
sound = sound * amplEnvelope
}
# prepare vocal tract filter (formants + some spectral noise + lip radiation)
if (sum(sound) == 0) {
# ie if we didn't synthesize a voiced syllable (unvoiced part only) -
# otherwise fft glitches
soundFiltered = sound
} else {
# for very short sounds, make sure the analysis window is no more
# than half the sound's length
windowLength_points = min(windowLength_points, floor(length(sound) / 2))
step = seq(1,
max(1, (length(sound) - windowLength_points)),
windowLength_points - (overlap * windowLength_points / 100))
nc = length(step) # number of windows for fft
nr = windowLength_points / 2 # number of frequency bins for fft
# are formants moving or stationary?
if (is.list(formants)) {
movingFormants = max(sapply(formants, function(x) sapply(x, length))) > 1
} else {
movingFormants = FALSE
}
if (is.list(mouthAnchors) && sum(mouthAnchors$value != .5) > 0) {
movingFormants = TRUE
}
nInt = ifelse(movingFormants, nc, 1)
# prepare the filter
spectralEnvelope = getSpectralEnvelope(
nr = nr,
nc = nInt,
formants = formants,
formantDep = formantDep,
formantDepStoch = formantDepStoch,
rolloffLip = rolloffLip,
mouthAnchors = mouthAnchors,
temperature = temperature,
formDrift = tempEffects$formDrift,
formDisp = tempEffects$formDisp,
samplingRate = samplingRate,
vocalTract = vocalTract
)
# image(t(spectralEnvelope))
# fft and filtering
z = seewave::stft(
wave = as.matrix(sound),
f = samplingRate,
wl = windowLength_points,
zp = 0,
step = step,
wn = 'hamming',
fftw = FALSE,
scale = TRUE,
complex = TRUE
)
if (movingFormants) {
z = z * spectralEnvelope
} else {
z = apply (z, 2, function(x)
x * spectralEnvelope)
}
# inverse fft
soundFiltered = as.numeric(
seewave::istft(
z,
f = samplingRate,
ovlp = overlap,
wl = windowLength_points,
output = "matrix"
)
)
soundFiltered = soundFiltered / max(soundFiltered) # normalize
}
# spectrogram(soundFiltered, samplingRate = samplingRate)
# playme(soundFiltered, samplingRate = samplingRate)
# add the separately filtered noise back into the sound at the appropriate time points AFTER filtering the sound
if (length(unvoiced) > 0 && is.list(formantsNoise)) {
for (s in 1:length(unvoiced)) {
soundFiltered = addVectors(soundFiltered, unvoiced[[s]],
insertionPoint = syllableStartIdx[s])
}
} # plot(soundFiltered, type = 'l')
# trill - rapid regular amplitude modulation
if (amDep > 0) {
# trill = 1 - sin(2 * pi * (1:length(soundFiltered)) /
# samplingRate * amFreq) * amDep / 100
sig = getSigmoid(len = length(soundFiltered),
samplingRate = samplingRate,
freq = amFreq,
shape = amShape)
trill = 1 - sig * amDep / 100
# plot(trill, type='l')
} else {
trill = 1
}
soundFiltered = soundFiltered * trill
# grow bout
if (b == 1) {
bout = soundFiltered
} else {
bout = c(bout,
rep(0, pauseLen * samplingRate / 1000),
soundFiltered)
}
}
# add some silence before and after the entire bout
if (!is.na(addSilence)) {
n = round(samplingRate / 1000 * addSilence)
bout = c(rep(0, n), bout, rep(0, n))
}
if (play) {
playme(bout, samplingRate = samplingRate)
# spectrogram((soundFiltered, samplingRate = samplingRate, osc = TRUE)
}
if (!is.na(savePath)) {
seewave::savewav(bout, filename = savePath, f = samplingRate)
}
if(plot) {
spectrogram(bout, samplingRate = samplingRate, ...)
}
return(bout)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.