analyze  R Documentation 
Acoustic analysis of one or more sounds: pitch tracking, basic spectral
characteristics, formants, estimated loudness (see
getLoudness
), roughness (see modulationSpectrum
),
novelty (see ssm
), etc. The default values of arguments are
optimized for human nonlinguistic vocalizations. See
vignette('acoustic_analysis', package = 'soundgen') for details. The defaults
and reasonable ranges of all arguments can be found in
defaults_analyze
. For highprecision work, first extract and
manually correct pitch contours with pitch_app
, PRAAT, or
whatever, and then run analyze(pitchManual = ...)
with these manual
contours.
analyze(
x,
samplingRate = NULL,
scale = NULL,
from = NULL,
to = NULL,
dynamicRange = 80,
silence = 0.04,
windowLength = 50,
step = 25,
overlap = 50,
specType = c("spectrum", "reassign", "spectralDerivative")[1],
wn = "gaussian",
zp = 0,
cutFreq = NULL,
nFormants = 3,
formants = list(),
loudness = list(SPL_measured = 70),
roughness = list(windowLength = 15, step = 3, amRes = 10),
novelty = list(input = "melspec", kernelLen = 100),
pitchMethods = c("dom", "autocor"),
pitchManual = NULL,
entropyThres = 0.6,
pitchFloor = 75,
pitchCeiling = 3500,
priorMean = 300,
priorSD = 6,
priorAdapt = TRUE,
nCands = 1,
minVoicedCands = NULL,
pitchDom = list(domThres = 0.1, domSmooth = 220),
pitchAutocor = list(autocorThres = 0.7, autocorSmooth = 7, autocorUpsample = 25,
autocorBestPeak = 0.975),
pitchCep = list(cepThres = 0.75, cepZp = 0),
pitchSpec = list(specThres = 0.05, specPeak = 0.25, specHNRslope = 0.8, specSmooth =
150, specMerge = 0.1, specSinglePeakCert = 0.4, specRatios = 3),
pitchHps = list(hpsNum = 5, hpsThres = 0.1, hpsNorm = 2, hpsPenalty = 2),
pitchZc = list(zcThres = 0.1, zcWin = 5),
harmHeight = list(harmThres = 3, harmTol = 0.25, harmPerSel = 5),
subh = list(method = c("cep", "pitchCands", "harm")[1], nSubh = 5, tol = 0.05, nHarm =
5, harmThres = 12, harmTol = 0.25, amRange = c(10, 200)),
flux = list(thres = 0.15, smoothWin = 100),
amRange = c(10, 200),
fmRange = c(5, 1000/step/2),
shortestSyl = 20,
shortestPause = 60,
interpol = list(win = 75, tol = 0.3, cert = 0.3),
pathfinding = c("none", "fast", "slow")[2],
annealPars = list(maxit = 5000, temp = 1000),
certWeight = 0.5,
snakeStep = 0,
snakePlot = FALSE,
smooth = 1,
smoothVars = c("pitch", "dom"),
summaryFun = c("mean", "median", "sd"),
invalidArgAction = c("adjust", "abort", "ignore")[1],
reportEvery = NULL,
cores = 1,
plot = FALSE,
osc = "linear",
showLegend = TRUE,
savePlots = NULL,
pitchPlot = list(col = rgb(0, 0, 1, 0.75), lwd = 3, showPrior = TRUE),
extraContour = NULL,
ylim = NULL,
xlab = "Time",
ylab = NULL,
main = NULL,
width = 900,
height = 500,
units = "px",
res = NA,
...
)
x 
path to a folder, one or more wav or mp3 files c('file1.wav', 'file2.mp3'), Wave object, numeric vector, or a list of Wave objects or numeric vectors 
samplingRate 
sampling rate of 
scale 
maximum possible amplitude of input used for normalization of
input vector (only needed if 
from, to 
if NULL (default), analyzes the whole sound, otherwise from...to (s) 
dynamicRange 
dynamic range, dB. All values more than one dynamicRange under maximum are treated as zero 
silence 
(0 to 1 as proportion of max amplitude) frames with RMS
amplitude below 
windowLength 
length of FFT window, ms 
step 
you can override 
overlap 
overlap between successive FFT frames, % 
specType 
plot the original FFT ('spectrum'), reassigned spectrogram ('reassigned'), or spectral derivative ('spectralDerivative') 
wn 
window type accepted by 
zp 
window length after zero padding, points 
cutFreq 
if specified, spectral descriptives (peakFreq, specCentroid,
specSlope, and quartiles) are calculated only between 
nFormants 
the number of formants to extract per STFT frame (0 = no formant analysis, NULL = as many as possible) 
formants 
a list of arguments passed to

loudness 
a list of parameters passed to 
roughness 
a list of parameters passed to

novelty 
a list of parameters passed to 
pitchMethods 
methods of pitch estimation to consider for determining pitch contour: 'autocor' = autocorrelation (~PRAAT), 'cep' = cepstral, 'spec' = spectral (~BaNa), 'dom' = lowest dominant frequency band, 'hps' = harmonic product spectrum, NULL = no pitch analysis 
pitchManual 
manually corrected pitch contour. For a single sound,
provide a numeric vector of any length. For multiple sounds, provide a
dataframe with columns "file" and "pitch" (or path to a csv file) as
returned by 
entropyThres 
pitch tracking is only performed for frames with Weiner
entropy below 
pitchFloor, pitchCeiling 
absolute bounds for pitch candidates (Hz) 
priorMean, priorSD 
specifies the mean (Hz) and standard deviation
(semitones) of gamma distribution describing our prior knowledge about the
most likely pitch values for this file. For ex., 
priorAdapt 
adaptive secondpass prior: if TRUE, optimal pitch contours
are estimated first with a prior determined by 
nCands 
maximum number of pitch candidates per method, normally 1...4
(except for 
minVoicedCands 
minimum number of pitch candidates that have to be
defined to consider a frame voiced (if NULL, defaults to 2 if 
pitchDom 
a list of control parameters for pitch tracking using the
lowest dominant frequency band or "dom" method; see details and

pitchAutocor 
a list of control parameters for pitch tracking using the
autocorrelation or "autocor" method; see details and

pitchCep 
a list of control parameters for pitch tracking using the
cepstrum or "cep" method; see details and 
pitchSpec 
a list of control parameters for pitch tracking using the
BaNa or "spec" method; see details and 
pitchHps 
a list of control parameters for pitch tracking using the
harmonic product spectrum or "hps" method; see details and

pitchZc 
a list of control parameters for pitch tracking based on zero
crossings in bandpassfiltered audio or "zc" method; see

harmHeight 
a list of control parameters for estimating how high
harmonics reach in the spectrum; see details and 
subh 
a list of control parameters for estimating the strength of
subharmonics per frame  that is, spectral energy at integer ratios of f0:
see 
flux 
a list of control parameters for calculating featurebased flux
(not spectral flux) passed to 
amRange 
target range of frequencies for amplitude modulation, Hz: a
vector of length 2 (affects both 
fmRange 
target range of frequencies for analyzing frequency
modulation, Hz ( 
shortestSyl 
the smallest length of a voiced segment (ms) that constitutes a voiced syllable (shorter segments will be replaced by NA, as if unvoiced) 
shortestPause 
the smallest gap between voiced syllables (ms): large value = interpolate and merge, small value = treat as separate syllables separated by an unvoiced gap 
interpol 
a list of parameters (currently 
pathfinding 
method of finding the optimal path through pitch
candidates: 'none' = best candidate per frame, 'fast' = simple heuristic,
'slow' = annealing. See 
annealPars 
a list of control parameters for postprocessing of
pitch contour with SANN algorithm of 
certWeight 
(0 to 1) in pitch postprocessing, specifies how much we prioritize the certainty of pitch candidates vs. pitch jumps / the internal tension of the resulting pitch curve 
snakeStep 
optimized path through pitch candidates is further
processed to minimize the elastic force acting on pitch contour. To
disable, set 
snakePlot 
if TRUE, plots the snake 
smooth, smoothVars 
if 
summaryFun 
functions used to summarize each acoustic characteristic, eg "c('mean', 'sd')"; userdefined functions are fine (see examples); NAs are omitted automatically for mean/median/sd/min/max/range/sum, otherwise take care of NAs yourself 
invalidArgAction 
what to do if an argument is invalid or outside the
range in 
reportEvery 
when processing multiple inputs, report estimated time left every ... iterations (NULL = default, NA = don't report) 
cores 
number of cores for parallel processing 
plot 
if TRUE, produces a spectrogram with pitch contour overlaid 
osc 
"none" = no oscillogram; "linear" = on the original scale; "dB" = in decibels 
showLegend 
if TRUE, adds a legend with pitch tracking methods 
savePlots 
full path to the folder in which to save the plots (NULL = don't save, ” = same folder as audio) 
pitchPlot 
a list of graphical parameters for displaying the final
pitch contour. Set to 
extraContour 
name of an output variable to overlap on the pitch
contour plot, eg 'peakFreq' or 'loudness'; can also be a list with extra
graphical parameters, eg 
ylim 
frequency range to plot, kHz (defaults to 0 to Nyquist frequency). NB: still in kHz, even if yScale = bark, mel, or ERB 
xlab, ylab, main 
plotting parameters 
width, height, units, res 
parameters passed to

... 
other graphical parameters passed to 
Each pitch tracker is controlled by its own list of settings, as follows:
pitchDom
(lowest dominant frequency band)domThres
(0 to 1) to find the lowest dominant frequency band, we
do shortterm FFT and take the lowest frequency with amplitude at least
domThres
domSmooth
the width of smoothing interval (Hz) for
finding dom
pitchAutocor
(autocorrelation)autocorThres
voicing threshold (unitless, ~0 to 1)
autocorSmooth
the width of smoothing interval (in bins) for
finding peaks in the autocorrelation function. Defaults to 7 for sampling
rate 44100 and smaller odd numbers for lower values of sampling rate
autocorUpsample
upsamples acf to this resolution (Hz) to improve
accuracy in high frequencies
autocorBestPeak
amplitude of the
lowest best candidate relative to the absolute max of the acf
pitchCep
(cepstrum)cepThres
voicing
threshold (unitless, ~0 to 1)
cepZp
zeropadding of the spectrum
used for cepstral pitch detection (final length of spectrum after
zeropadding in points, e.g. 2 ^ 13)
pitchSpec
(ratio of
harmonics  BaNa algorithm)specThres
voicing
threshold (unitless, ~0 to 1)
specPeak,specHNRslope
when looking
for putative harmonics in the spectrum, the threshold for peak detection is
calculated as specPeak * (1  HNR * specHNRslope)
specSmooth the width of window for detecting peaks in the spectrum, Hz
specMerge
pitch candidates within specMerge
semitones are
merged with boosted certainty
specSinglePeakCert
(0 to 1) if F0
is calculated based on a single harmonic ratio (as opposed to several ratios
converging on the same candidate), its certainty is taken to be
specSinglePeakCert
hpsNum
the number of times to downsample the
spectrum
hpsThres
voicing threshold (unitless, ~0 to 1)
hpsNorm
the amount of inflation of hps pitch certainty (0 = none)
hpsPenalty
the amount of penalizing hps candidates in low
frequencies (0 = none)
Each of these lists also accepts graphical
parameters that affect how pitch candidates are plotted, eg pitchDom =
list(domThres = .5, col = 'yellow')
. Other arguments that are lists of
subroutinespecific settings include:
harmonicHeight
(finding how high harmonics reach in the spectrum)harmThres
minimum height of spectral peak, dB
harmPerSel
the number of harmonics per sliding selection
harmTol
maximum
tolerated deviation of peak frequency from multiples of f0, proportion of f0
Returns a list with $detailed
framebyframe descriptives and
a $summary
with one row per file, as determined by summaryFun
(e.g., mean / median / SD of each acoustic variable across all STFT
frames). Output measures include:
total duration, s
duration from the beginning of the first
nonsilent STFT frame to the end of the last nonsilent STFT frame, s (NB:
depends strongly on windowLength
and silence
settings)
time of the middle of each frame (ms)
frequency (Hz) and depth (0 to 1) of amplitude modulation estimated from a smoothed amplitude envelope
frequency and purity of amplitude
modulation estimated via modulationSpectrum
root mean square of amplitude per frame, calculated as sqrt(mean(frame ^ 2))
same as ampl
, but
ignoring silent frames
Cepstral Peak Prominence, dB (a measure of pitch quality, the ratio of the highest peak in the cepstrum to the regression line drawn through it)
lowest dominant frequency band (Hz) (see "Pitch tracking methods / Dominant frequency" in the vignette)
Weiner entropy of the spectrum of the current frame. Close to 0: pure tone or tonal sound with nearly all energy in harmonics; close to 1: white noise
Normalized Shannon entropy of the spectrum of the current frame: 0 = pure tone, 1 = white noise
the frequency and bandwidth of the first nFormants formants per STFT frame, as calculated by phonTools::findformants
featurebased flux, the rate of change
in acoustic features such as pitch, HNR, etc. (0 = none, 1 = max); "epoch"
is an audio segment between two peaks of flux that exceed a threshold of
flux = list(thres = ...)
(listed in output$detailed only)
frequency of frequency modulation (FM) such as vibrato or jitter, Hz
depth of FM, semitones
purity or dominance of the main FM frequency (fmFreq), 0 to 1
the amount of energy in upper harmonics, namely the ratio of total spectral mass above 1.25 x F0 to the total spectral mass below 1.25 x F0 (dB)
how high harmonics reach in the spectrum, based on the best guess at pitch (or the manually provided pitch values)
harmonicstonoise ratio (dB), a measure of harmonicity returned by soundgen:::getPitchAutocor (see "Pitch tracking methods / Autocorrelation"). If HNR = 0 dB, there is as much energy in harmonics as in noise
subjective loudness, in sone, corresponding to
the chosen SPL_measured  see getLoudness
spectral novelty  a measure of how variable the spectrum is
on a particular time scale, as estimated by ssm
the frequency with maximum spectral power (Hz)
postprocessed pitch contour based on all F0 estimates
the 25th, 50th, and 75th quantiles of the spectrum of voiced frames (Hz)
the amount of amplitude modulation, see modulationSpectrum
the center of gravity of the frame’s spectrum, first spectral moment (Hz)
the slope of linear regression fit to the spectrum below cutFreq (dB/kHz)
estimated depth of subharmonics per frame: 0 = none, 1 = as strong as f0. NB: this depends critically on accurate pitch tracking
the ratio of f0 to subharmonics frequency with strength subDep: 2 = period doubling, 3 = f0 / 3, etc.
is the current STFT frame voiced? TRUE / FALSE
pitch_app
getLoudness
segment
getRMS
sound = soundgen(sylLen = 300, pitch = c(500, 400, 600),
noise = list(time = c(0, 300), value = c(40, 0)),
temperature = 0.001,
addSilence = 50) # NB: always have some silence before and after!!!
# playme(sound, 16000)
a = analyze(sound, samplingRate = 16000, plot = TRUE)
str(a$detailed) # framebyframe
a$summary # summary per sound
## Not run:
# For maximum processing speed (just basic spectral descriptives):
a = analyze(sound, samplingRate = 16000,
plot = FALSE, # no plotting
pitchMethods = NULL, # no pitch tracking
loudness = NULL, # no loudness analysis
novelty = NULL, # no novelty analysis
roughness = NULL, # no roughness analysis
nFormants = 0 # no formant analysis
)
# Take a sound hard to analyze b/c of subharmonics and jitter
sound2 = soundgen(sylLen = 900, pitch = list(
time = c(0, .3, .8, 1), value = c(300, 900, 400, 2300)),
noise = list(time = c(0, 900), value = c(40, 20)),
subDep = 10, jitterDep = 0.5,
temperature = 0.001, samplingRate = 44100, pitchSamplingRate = 44100)
# playme(sound2, 44100)
a2 = analyze(sound2, samplingRate = 44100, priorSD = 24,
plot = TRUE, ylim = c(0, 5))
# Compare the available pitch trackers
analyze(sound2, 44100,
pitchMethods = c('dom', 'autocor', 'spec', 'cep', 'hps', 'zc'),
# don't use priors to see weird pitch candidates better
priorMean = NA, priorAdapt = FALSE,
plot = TRUE, yScale = 'bark')
# Fancy plotting options:
a = analyze(sound2, samplingRate = 44100, plot = TRUE,
xlab = 'Time, ms', colorTheme = 'seewave',
contrast = .5, ylim = c(0, 4), main = 'My plot',
pitchMethods = c('dom', 'autocor', 'spec', 'hps', 'cep'),
priorMean = NA, # no prior info at all
pitchDom = list(col = 'red', domThres = .25),
pitchPlot = list(col = 'black', pch = 9, lty = 3, lwd = 3),
extraContour = list(x = 'peakFreq', type = 'b', pch = 4, col = 'brown'),
osc = 'dB', heights = c(2, 1))
# Analyze an entire folder in one go, saving spectrograms with pitch contours
# plus an html file for easy access
s2 = analyze('~/Downloads/temp',
savePlots = '', # save in the same folder as audio
showLegend = TRUE, yScale = 'bark',
width = 20, height = 12,
units = 'cm', res = 300, ylim = c(0, 5))
s2$summary[, 1:5]
# Different options for summarizing the output
a = analyze(sound1, 44100,
summaryFun = c('mean', 'range'))
a$summary # one row per sound
# ...with custom summaryFun, eg time of peak relative to duration (0 to 1)
timePeak = function(x) which.max(x) / length(x) # without omitting NAs
timeTrough = function(x) which.min(x) / length(x)
a = analyze(sound2, samplingRate = 16000,
summaryFun = c('mean', 'timePeak', 'timeTrough'))
colnames(a$summary)
# Analyze a selection rather than the whole sound
a = analyze(sound, samplingRate = 16000, from = .1, to = .3, plot = TRUE)
# Use only a range of frequencies when calculating spectral descriptives
# (ignore everything below 100 Hz and above 8000 Hz as irrelevant noise)
a = analyze(sound, samplingRate = 16000, cutFreq = c(100, 8000))
## Amplitude and loudness: analyze() should give the same results as
# dedicated functions getRMS() / getLoudness()
# Create 1 kHz tone
samplingRate = 16000; dur_ms = 50
sound3 = sin(2*pi*1000/samplingRate*(1:(dur_ms/1000*samplingRate)))
a1 = analyze(sound3, samplingRate = samplingRate, scale = 1,
windowLength = 25, overlap = 50,
loudness = list(SPL_measured = 40),
pitchMethods = NULL, plot = FALSE)
a1$detailed$loudness # loudness per STFT frame (1 sone by definition)
getLoudness(sound3, samplingRate = samplingRate, windowLength = 25,
overlap = 50, SPL_measured = 40, scale = 1)$loudness
a1$detailed$ampl # RMS amplitude per STFT frame
getRMS(sound3, samplingRate = samplingRate, windowLength = 25,
overlap = 50, scale = 1)$detailed
# or even simply: sqrt(mean(sound3 ^ 2))
# The same sound as above, but with half the amplitude
a_half = analyze(sound3 / 2, samplingRate = samplingRate, scale = 1,
windowLength = 25, overlap = 50,
loudness = list(SPL_measured = 40),
pitchMethods = NULL, plot = FALSE)
a1$detailed$ampl / a_half$detailed$ampl # rms amplitude halved
a1$detailed$loudness/ a_half$detailed$loudness
# loudness is not a linear function of amplitude
# Analyzing ultrasounds (slow but possible, just adjust pitchCeiling)
s = soundgen(sylLen = 100, addSilence = 10,
pitch = c(25000, 35000, 30000),
formants = NA, rolloff = 12, rolloffKHz = 0,
pitchSamplingRate = 350000, samplingRate = 350000, windowLength = 5,
pitchCeiling = 45000, invalidArgAction = 'ignore')
# s is a batlike ultrasound inaudible to humans
a = analyze(s, 350000, plot = TRUE,
pitchCeiling = 45000, priorMean = NA,
windowLength = 2, overlap = 0,
nFormants = 0, loudness = NULL)
# NB: ignore formants and loudness estimates for such nonhuman sounds
# download 260 sounds from Anikin & Persson (2017)
# http://cogsci.se/publications/anikinpersson_2017_nonlinguisticvocs/260sounds_wav.zip
# unzip them into a folder, say '~/Downloads/temp'
myfolder = '~/Downloads/temp' # 260 .wav files live here
s = analyze(myfolder) # ~ 1020 minutes!
# s = write.csv(s, paste0(myfolder, '/temp.csv')) # save a backup
# Check accuracy: import manually verified pitch values (our "key")
# pitchManual # "ground truth" of mean pitch per sound
# pitchContour # "ground truth" of complete pitch contours per sound
files_manual = paste0(names(pitchManual), '.wav')
idx = match(s$file, files_manual) # in case the order is wrong
s$key = pitchManual[idx]
# Compare manually verified mean pitch with the output of analyze:
cor(s$key, s$summary$pitch_median, use = 'pairwise.complete.obs')
plot(s$key, s$summary$pitch_median, log = 'xy')
abline(a=0, b=1, col='red')
# Rerunning analyze with manually corrected contours gives correct
pitchrelated descriptives like amplVoiced and harmonics (NB: you get it "for
free" when running pitch_app)
s1 = analyze(myfolder, pitchManual = pitchContour)
plot(s$summary$harmonics_median, s1$summary$harmonics_median)
abline(a=0, b=1, col='red')
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.