# Chunk defaults for this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = ""
)
library(avutils)
The idea here is to provide R functions that wrap up the shell commands normally used to invoke the virtual machine with the DiViMe tools. The functions are by and large calls to the `system2()` function (or `system()`), with some processing around these calls. There is now also an early version of a shiny app (try `divime_app()`).
# Overview figure: one column per task and four text rows per column
# (task title, R function, output file templates, underlying shell scripts).
par(family = "serif", mar = c(0, 0, 0, 0))
plot(0, 0, type = "n", xlim = c(0, 50), ylim = c(20, 0), axes = FALSE,
     ann = FALSE)

textcex <- 0.4
r <- FALSE # set to TRUE to outline each cell with a rectangle

# Draw one cell: an optional outline plus a label centred in the cell.
# `cds` holds the four coordinates handed to rect() in that order;
# `...` is forwarded to text() (used for `family = "mono"`).
draw_cell <- function(cds, labs, ...) {
  if (r) {
    rect(cds[1], cds[2], cds[3], cds[4])
  }
  text(x = mean(cds[c(1, 3)]), y = mean(cds[c(2, 4)]), labels = labs,
       cex = textcex, ...)
}

# Draw the four stacked cells of one task column starting at `x_left`.
draw_column <- function(x_left, title, fun, outputs, scripts) {
  x_right <- x_left + 8
  draw_cell(c(x_left, 4, x_right, 2), title)
  draw_cell(c(x_left, 6, x_right, 8), fun, family = "mono")
  draw_cell(c(x_left, 10, x_right, 14), outputs)
  draw_cell(c(x_left, 16, x_right, 20), scripts)
}

# speech activity detection -----------------------------------------------
draw_column(
  x_left = 1,
  title = "speech activity detection",
  fun = "divime_sad()",
  outputs = "noisemesSad_X.rttm\nopensmileSad_X.rttm\ntocomboSad_X.rttm",
  scripts = "noisemesSad.sh\nopensmileSad.sh\ntocomboSad.sh"
)

# diarization -------------------------------------------------------------
draw_column(
  x_left = 11,
  title = "talker diarization",
  fun = "divime_diarization(speech_annos = ...)",
  outputs = paste("diartk_noisemesSad_X.rttm", "diartk_opensmileSad_X.rttm",
                  "diartk_tocomboSad_X.rttm", sep = "\n"),
  # the last entry is a single line: "diartk.sh ... tocomboSad"
  scripts = paste("diartk.sh ... noisemesSad", "diartk.sh ... opensmileSad",
                  "diartk.sh ... tocomboSad", sep = "\n")
)

# talker type -------------------------------------------------------------
draw_column(
  x_left = 21,
  title = "talker type",
  fun = "divime_talkertype(marvinator = ...)",
  outputs = "yunitator_old_X.rttm\nyunitator_english_X.rttm",
  scripts = "yunitate.sh\nyunitate.sh ... english"
)

# vocalization classification ---------------------------------------------
draw_column(
  x_left = 31,
  title = "classify vocalizations",
  fun = "divime_classify_vox(marvinator = ...)",
  outputs = "vcm_X.rttm\nvcm_english_X.rttm",
  # the last entry is a single line: "vcm.sh ... english"
  scripts = "vcm.sh\nvcm.sh ... english"
)

# word count --------------------------------------------------------------
draw_column(
  x_left = 41,
  title = "word count",
  fun = "divime_wordcount(speech_annos = ...)",
  outputs = paste("WCE_noisemesSad_X.rttm", "WCE_opensmileSad_X.rttm",
                  "WCE_tocomboSad_X.rttm", "WCE_yunitator_english_X.rttm",
                  sep = "\n"),
  scripts = paste("WCE_from_SAD_outputs.sh ... noisemesSad",
                  "WCE_from_SAD_outputs.sh ... opensmileSad",
                  "WCE_from_SAD_outputs.sh ... tocomboSad",
                  "WCE_from_SAD_outputs.sh ... english", sep = "\n")
)
Here is a tabular overview.
# Tabular overview: one row per function/argument combination.
# Columns: task, function, main argument, required input, output template,
# comment. "X" in a template stands for the audio file name.
headers <- c("task", "function", "main argument", "requires",
             "output template", "comment")

sad1 <- c("**speech detection**", "`divime_sad`", "`module = \"noisemes\"`",
          " ", "noisemesSad_X", "")
sad2 <- c(" ", " ", "`module = \"opensmile\"`",
          " ", "opensmileSad_X", "")
sad3 <- c(" ", " ", "`module = \"tocombo\"`",
          " ", "tocomboSad_X", "")

diar1 <- c("**talker diarization**", "`divime_diarization`",
           "`speech_annos = \"noisemes\"`",
           "noisemesSad_X", "diartk_noisemesSad_X",
           "identification of individual speakers")
# output template follows the SAD input it was derived from
diar2 <- c(" ", " ", "`speech_annos = \"opensmile\"`",
           "opensmileSad_X", "diartk_opensmileSad_X", "")
diar3 <- c(" ", " ", "`speech_annos = \"tocombo\"`",
           "tocomboSad_X", "diartk_tocomboSad_X", "")

yuni1 <- c("**talker type**", "`divime_talkertype`", "`marvinator = FALSE`",
           " ", "yunitator_old_X", "CHI vs FEM vs MAL")
yuni2 <- c(" ", " ", "`marvinator = TRUE`",
           " ", "yunitator_english_X", "")

vcm1 <- c("**vocalization classification**", "`divime_classify_vox`",
          "`marvinator = FALSE`",
          "yunitator_old_X", "vcm_X", "CNS vs NCS vs CRY vs OTH")
# NOTE(review): the overview figure labels this output "vcm_english_X.rttm";
# confirm which template divime_classify_vox() actually produces.
vcm2 <- c(" ", " ", "`marvinator = TRUE`",
          "yunitator_english_X", "vcm_marvinator_X", "")

wce1 <- c("**word count**", "`divime_wordcount`",
          "`speech_annos = \"noisemes\"`",
          "noisemesSad_X", "WCE_noisemesSad_X", "")
wce2 <- c(" ", " ", "`speech_annos = \"opensmile\"`",
          "opensmileSad_X", "WCE_opensmileSad_X", "")
wce3 <- c(" ", " ", "`speech_annos = \"tocombo\"`",
          "tocomboSad_X", "WCE_tocomboSad_X", "")
wce4 <- c(" ", " ", "`speech_annos = \"yunitator_english\"`",
          "yunitator_english_X", "WCE_yunitator_english_X", "")

xtab <- rbind(sad1, sad2, sad3, diar1, diar2, diar3, yuni1, yuni2,
              vcm1, vcm2, wce1, wce2, wce3, wce4)
colnames(xtab) <- headers
knitr::kable(xtab, row.names = FALSE)
As a requirement, you need the `vagrant` and `VirtualBox` software installed. Together they provide the environment in which you can set up and run your virtual machine, which in effect gives you access to the DiViMe tools.
If you are on Windows you might also need git
. If you are on a Mac chances are you have got git
already.
For more details see https://divime.readthedocs.io/en/latest/.
For Windows users: whenever file paths need to be specified please use single forward slashes, e.g. c:/data/myaudiofiles
First off, you will need the following R packages installed:
xml2
tuneR
parallel
if you want to use this package to set up the DiViMe virtual machine from scratch
shiny
and shinyFiles
if you want to use the shiny interactive user interface
Regarding the DiViMe tools themselves, you will require a fair amount of disk space on the drive where you are going to install the virtual machine (25GB+).
You also need sufficient working memory, at least 8GB.
The DiViMe documentation also lists at least two processor cores as requirement but allows adapting in case your computer only has one core. The function to set up the VM (described below) will check how many cores you have and adapt the installation process if necessary.
Regardless of whether you are using Windows or MacOS you will need vagrant
and VirtualBox
. I can't be sure whether the package works with all combinations of versions of these two programs, so if you want to play it safe, use `vagrant` 2.2.4 and `VirtualBox` 5.2.28, which are the versions that I used while writing this package.
If you use Windows, you probably also need git
(https://git-scm.com/download/win), and I expect that version differences here don't make any difference so take the most recent version.
If you use a Mac, you might need to install Xcode
in order to get access to git
, but I'm not sure about this. So, try the test first that is mentioned just below and only if that fails, go ahead and install Xcode
.
Once you installed everything, you can check whether so far all is set up correctly by using (you probably need to restart your computer though):
# Print the installed versions of vagrant and git; each call runs the
# executable found on the search path by Sys.which().
system2(Sys.which("vagrant"), "-v")
system2(Sys.which("git"), "--version")
If this results in some meaningful output, i.e. you get some info about the versions installed, then you are good to go.
This step describes the function that sets up the virtual machine. You can also omit this step and follow the instructions described here: https://divime.readthedocs.io/en/latest/install.html. In either case you need a running internet connection. If you already have a virtual machine installed, you can skip this section and go ahead to the next about testing whether your virtual machine can be accessed from R.
The function divime_vagrant_setup
will take care of these steps without need for using bash commands. There are three things to note here:
You need to choose a location where to install the virtual machine (remember that the drive needs plenty of free space).
By default, the virtual machine gets allocated 2GB of RAM. However, this might not be enough to run all the DiViMe tools. You can (and probably should) set the memory higher with the argument `memoryoverride`, to about half of the RAM that your computer has. For example, if you have 8GB of RAM, set `memoryoverride = 4096`.
Finally, note that the process of setting up the virtual machine will take quite some time: expect at least 20 minutes, up to an hour.
# Pick the call that matches your operating system; `divime_loc` is the
# folder in which the virtual machine gets installed.
# with a Mac
divime_vagrant_setup(divime_loc = "/Volumes/Data/VMX", memoryoverride = 4096)
# with Windows
divime_vagrant_setup(divime_loc = "C:/Data/VMX", memoryoverride = 4096)
Once the setup step is completed, let's test whether we can communicate with the virtual machine from within R. For this we use `divime_vagrant_state()`, which allows checking or changing the state of the virtual machine. Note that you need to append `DiViMe/` to the path of the installation.
# Query the state of the VM; note the trailing "DiViMe/" on the path.
# with a Mac
divime_vagrant_state(divime_loc = "/Volumes/Data/VMX/DiViMe/", what = "status")
# with Windows
divime_vagrant_state(divime_loc = "C:/Data/VMX/DiViMe/", what = "status")
If this results in a printed message "running (virtualbox)
", things are looking good. If you don't get this message, try:
# Start the VM if the status check did not report it as running.
# with a Mac
divime_vagrant_state(divime_loc = "/Volumes/Data/VMX/DiViMe/", what = "start")
# with Windows
divime_vagrant_state(divime_loc = "C:/Data/VMX/DiViMe/", what = "start")
If the above steps were successful, you can run the test protocol of the DiViMe package. Running this will take a few minutes because it will download some example data.
# Run the DiViMe test protocol against the installed VM.
# with a Mac
divime_test(divime_loc = "/Volumes/Data/VMX/DiViMe/")
# with Windows
divime_test(divime_loc = "C:/Data/VMX/DiViMe/")
This step downloads a sound file from TalkBank and runs this file (a 5-minute segment) through most of the tools in the DiViMe package. The output of this function should be self-explanatory if successful.
`divime_` functions

Currently, there are the following functions available to run specific DiViMe tools:
divime_sad()
: runs SAD modules with the option to choose between opensmile
, noisemes
and tocombo
divime_diarization()
: runs diartk
with your choice of existing SAD output
divime_talkertype()
: runs yunitate
divime_classify_vox()
: runs vcm
divime_fullnoisemes()
: runs the noisemes
module but outputs all found noiseme types (including non-speech)
divime_wordcount()
: runs wce
(word count), which requires speech detection output from either divime_sad()
or divime_talkertype(marvinator = TRUE, ...)
divime_run_all()
: runs all the above tools in one go
The general idea of how these functions work is the same for all:
Specify where your VM is located.
Specify a location where you have your audio files stored.
for divime_sad()
you need to specify which SAD module you wish to run with the module=
argument
for divime_diarization()
you need to specify what kind of existing SADs you want to use with the speech_annos=
argument (sorry for the different argument names, I will eventually make them the same for both functions...)
What the functions then do is to copy your audio file to your DiViMe location, have the DiViMe tools process them there, and copy the output (.rttm file(s)) back to the location of your audio files. After that is done, the files will be deleted from the DiViMe location again.
During these steps there are again a few things to note:
one major difference of this package to the instructions of the DiViMe docs is that you can keep your files wherever you want, i.e. you don't need to copy them manually to the DiViMe folder...
one consequence of this is that you should in fact store your files outside the DiViMe location because files in the DiViMe/data folder will be deleted...
also note that if you run any of the `divime_` functions, existing output is not overwritten by default. If you want existing results (i.e. .rttm files) to be replaced, you have to force this by setting the `overwrite=` argument to `TRUE`.
In practice, working with the divime_
functions could look like this.
# Use the pair of assignments that matches your operating system.
# Mac
myaudio <- "/Volumes/Data/audio" # location for audio files
divime <- "/Volumes/Data/VMX/DiViMe/" # location for DiViMe VM
# Windows
myaudio <- "D:/Data/audio" # location for audio files
divime <- "C:/Data/VMX/DiViMe/" # location for DiViMe VM
If your audio location contains two .wav files, e.g. 'test1.wav' and 'test2.wav', running the following command will add two more files to this location ('opensmileSad_test1.rttm' and 'opensmileSad_test2.rttm').
# Run the opensmile SAD module on the audio location.
divime_sad(
  divime_loc = divime,
  audio_loc = myaudio,
  module = "opensmile"
)
If you then want to run the diarization tool, the following function will add two more files to your audio location ('diartk_opensmileSad_test1.rttm' and 'diartk_opensmileSad_test2.rttm').
# Run diarization on top of the existing opensmile SAD output.
divime_diarization(
  divime_loc = divime,
  audio_loc = myaudio,
  speech_annos = "opensmile"
)
Let's go for a proper example, i.e. using two of the example files provided in the package. These short files contain a few sentences spoken by a voice synthesizer. We start by setting up a temporary folder and copying the sound files there. (Note that I use the R temporary file folder for this, which is deleted once you close R.)
# Create a working folder inside the R session's temporary directory
# (forward slashes so the path can also be used on Windows) ...
tdir <- file.path(normalizePath(tempdir(), winslash = "/"), "avutils")
dir.create(tdir)
# ... and copy the two example sound files shipped with the package there.
file.copy(from = system.file("synthetic_speech.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech.wav"))
file.copy(from = system.file("synthetic_speech_overlap.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech_overlap.wav"))
# open location in file browser
# shell.exec() is used on Windows and the `open` command otherwise, so only
# the call that fits the current platform is run.
if (.Platform$OS.type == "windows") {
  shell.exec(tdir) # Windows
} else {
  system2("open", tdir) # Mac
}
# Copy the pre-computed SAD results (opensmile and noisemes) for both example
# files from the package into the working folder.
file.copy(from = system.file("opensmileSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
file.copy(from = system.file("opensmileSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech_overlap.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech_overlap.rttm"))
Now let's run two speech detection algorithms on the two files in this folder.
# Run two SAD modules on the example files.
divime_sad(audio_loc = tdir, divime_loc = "/Volumes/Data/VMX/DiViMe/",
           module = "opensmile", vmshutdown = FALSE)
divime_sad(audio_loc = tdir, divime_loc = "/Volumes/Data/VMX/DiViMe/",
           module = "noisemes", vmshutdown = FALSE)
This should lead to four .rttm
files being created in the data folder:
# Show what is in the working folder now.
list.files(path = tdir)
Now let's read back one of these files.
# Read one result file back into R and print it.
file1 <- read_rttm(file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
file1
Please note that during the reading with read_rttm()
I assign column names that are not present in the actual .rttm file and I create an extra column to set the end time of each segment.
# Print the file exactly as stored on disk, one line per segment.
rttm_path <- file.path(tdir, "opensmileSad_synthetic_speech.rttm")
raw_lines <- readLines(rttm_path)
cat(paste(raw_lines, collapse = "\n"))
The app is still in an early development stage.
# Two ways of launching the shiny app.
divime_app() # starts app in your default browser
divime_app(launchinbrowser = FALSE) # starts app in RStudio window (if you use RStudio)
We start by providing the paths to two annotation files (one manually coded with ELAN, one the output from noisemes), which are part of the package and relate to an audio file that I made up with synthetic speech.
# Paths to two annotation files shipped with the package.
# a 'test' file
x1 <- system.file("noisemesSad_synthetic_speech.rttm", package = "avutils")
# the gold standard reference (manual coding)
x2 <- system.file("synthetic_speech.eaf", package = "avutils")
The first example is 'normal' in that both files contain detected speech and the overlap between the two annotation sets is fairly good.
# Compare the test annotations with the reference at 500 evaluation points.
evaluate_sad(
  test = x1,
  reference = x2,
  overlap_res = 500
)
Let's first consider the graphical output. We see two sets of lines, one black and one gold, which refer to the 'test' file (x1
) and the reference file (or the 'gold standard', x2
in our case). The lines represent the speech detected in both annotation sets. When you use ELAN files, note that the function will take all annotations from all tiers and consider them speech. If you want to exclude certain annotations or tiers, you can exclude them by providing the relevant arguments (see ?collapse_tiers
for more details on this). On the left side, we can see the time stamps (in seconds) that are covered.
The next thing to note is the top horizontal axis (evaluation points). The corresponding argument (`overlap_res=`) controls how many evaluation points are used. At each evaluation point, the function compares the two annotation sets with respect to whether an annotation exists at this point or not. This contrasts somewhat with the evaluation pipeline in the Python version, where each individual sample is evaluated.
The last type of information provided by the graphic are red circles between the two line sets. These circles (if present) indicate a mismatch between the two annotation sets, i.e. one file has speech detected at this point while the other has not.
The numerical output provides three types of information. First, the false alarm rate, which takes all evaluation points that show no speech in the standard x2
and calculates the proportion of evaluation points that (falsely) contain speech in the test file x1
. The missed speech rate is the opposite, that is the proportion of no-speech points in x1
when there is speech in the standard x2
. Finally, there is the total congruence, which is just the total proportion of all evaluation points where x1
has the same value as x2
. So, for FA and MSR, small values close to 0 are better, while for the total 1 is the best value.
For a proper evaluation, keep in mind to set `overlap_res=` to a reasonably large value. One rule of thumb could be to use ten evaluation points per second of recording, which would provide precision to the tenth of a second.
In order to aid visual assessment, `evaluate_sad()` allows you to break the display into multiple rows and/or to select a time segment within the file. Note that when you 'break' the file over multiple segments, all the evaluation metrics are returned for each segment separately. In the specific example that follows, note that in the last segment, there are no false alarms at all.
# break the display over 5 segments
evaluate_sad(test = x1, reference = x2, nsegments = 5, overlap_res = 100)
You can also select a time window. (the example here corresponds roughly to the second segment from just above).
# Restrict the evaluation to the window from 9.46 s to 18.94 s.
evaluate_sad(
  test = x1,
  reference = x2,
  from = 9.46,
  to = 18.94,
  overlap_res = 100
)
The DiViMe manual considers special cases when one or two of the files submitted contain no speech. Let's look at this with an example where we use predefined segments of the example files.
First, we consider the special case where the gold/manual annotation is empty and also the SAD module detected no speech.
# Window 27-28 s: the case where neither annotation set contains speech.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 27,
  to = 28,
  overlap_res = 10
)
As expected, both FA and MSR are 0 (as defined in the DiViMe manual).
Second, we consider the next case where the gold/manual annotation is empty but the SAD module detected some speech.
# Window 27-28.3 s: the case where only the test file contains speech.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 27,
  to = 28.3,
  overlap_res = 10
)
Here we see that the false alarm rate is 1 while the missed speech rate is set to 0.
And finally, the case where there was speech in the gold standard, but the SAD module didn't detect anything:
# Window 23.2-25 s: the case where only the reference contains speech.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 23.2,
  to = 25,
  overlap_res = 10
)
Here, FA = 0 and MSR is 1.
# Show three time windows of the same file pair side by side
# (one row, three panels).
par(mfrow = c(1, 3), family = "serif")
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 27, to = 28,
             overlap_res = 10)
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 27, to = 28.3,
             overlap_res = 10)
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 23.2, to = 25,
             overlap_res = 10)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.