# Chunk defaults for the rendered document: source and output are shown in a
# single block, and output lines carry no comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "")
library(avutils)

Overview

The idea here is to provide R functions that wrap up the shell commands normally used to invoke the virtual machine with the DiViMe tools. The functions are by and large calls to the system2() function (or system()), with some processing around these calls. There is now also an early version of a shiny app (try divime_app()).

# Overview diagram: one column per DiViMe task, four rows per column
# (task name, wrapping R function, output file templates, underlying shell
# scripts). Only text is drawn; set `r` to TRUE to also draw the cell outlines.
par(family = "serif", mar = c(0, 0, 0, 0))
# Empty canvas; the y axis is reversed so that row 1 ends up at the top.
plot(0, 0, type = "n", xlim = c(0, 50), ylim = c(20, 0), axes = FALSE,
     ann = FALSE)
textcex <- 0.4
r <- FALSE

# Vertical extent of the four rows, in the order passed on to rect().
row_y <- list(c(4, 2), c(6, 8), c(10, 14), c(16, 20))
# Only the row holding the R function call is set in monospace.
row_mono <- c(FALSE, TRUE, FALSE, FALSE)

# Cell labels, one character vector per column, from top row to bottom row.
columns <- list(
  # speech activity detection
  c(
    "speech activity detection",
    "divime_sad()",
    "noisemesSad_X.rttm\nopensmileSad_X.rttm\ntocomboSad_X.rttm",
    "noisemesSad.sh\nopensmileSad.sh\ntocomboSad.sh"
  ),
  # diarization
  c(
    "talker diarization",
    "divime_diarization(speech_annos = ...)",
    "diartk_noisemesSad_X.rttm\ndiartk_opensmileSad_X.rttm\ndiartk_tocomboSad_X.rttm",
    "diartk.sh ... noisemesSad\ndiartk.sh ... opensmileSad\ndiartk.sh ... tocomboSad"
  ),
  # talker type
  c(
    "talker type",
    "divime_talkertype(marvinator = ...)",
    "yunitator_old_X.rttm\nyunitator_english_X.rttm",
    "yunitate.sh\nyunitate.sh ... english"
  ),
  # vocalization classification
  c(
    "classify vocalizations",
    "divime_classify_vox(marvinator = ...)",
    "vcm_X.rttm\nvcm_english_X.rttm",
    "vcm.sh\nvcm.sh ... english"
  ),
  # word count
  c(
    "word count",
    "divime_wordcount(speech_annos = ...)",
    "WCE_noisemesSad_X.rttm\nWCE_opensmileSad_X.rttm\nWCE_tocomboSad_X.rttm\nWCE_yunitator_english_X.rttm",
    "WCE_from_SAD_outputs.sh ... noisemesSad\nWCE_from_SAD_outputs.sh ... opensmileSad\nWCE_from_SAD_outputs.sh ... tocomboSad\nWCE_from_SAD_outputs.sh ... english"
  )
)

for (i in seq_along(columns)) {
  # Columns are 8 units wide and start at x = 1, 11, 21, 31, 41.
  x_left <- 1 + 10 * (i - 1)
  for (j in seq_along(row_y)) {
    cds <- c(x_left, row_y[[j]][1], x_left + 8, row_y[[j]][2])
    labs <- columns[[i]][j]
    if (r) rect(cds[1], cds[2], cds[3], cds[4])
    # The label is centred within the cell.
    x_mid <- mean(cds[c(1, 3)])
    y_mid <- mean(cds[c(2, 4)])
    if (row_mono[j]) {
      text(x = x_mid, y = y_mid, labels = labs, cex = textcex, family = "mono")
    } else {
      text(x = x_mid, y = y_mid, labels = labs, cex = textcex)
    }
  }
}

Here is a tabular overview.

# Tabular overview of the divime_ functions. Every row vector follows the
# column order given in `headers`; cells holding a single space are
# intentionally blank so that a task/function name is only shown on the first
# row of its group.
headers <- c("task", "function", "main argument", "requires",
             "output template", "comment")

# speech activity detection
sad1 <- c("**speech detection**", "`divime_sad`", "`module = \"noisemes\"`", " ", "noisemesSad_X", "")
sad2 <- c(" ", " ", "`module = \"opensmile\"`", " ", "opensmileSad_X", "")
sad3 <- c(" ", " ", "`module = \"tocombo\"`", " ", "tocomboSad_X", "")

# talker diarization: the output name is "diartk_" followed by the name of the
# speech detection file it was derived from
diar1 <- c("**talker diarization**", "`divime_diarization`", "`speech_annos = \"noisemes\"`", "noisemesSad_X", "diartk_noisemesSad_X", "identification of individual speakers")
diar2 <- c(" ", " ", "`speech_annos = \"opensmile\"`", "opensmileSad_X", "diartk_opensmileSad_X", "")
diar3 <- c(" ", " ", "`speech_annos = \"tocombo\"`", "tocomboSad_X", "diartk_tocomboSad_X", "")

# talker type
yuni1 <- c("**talker type**", "`divime_talkertype`", "`marvinator = FALSE`", " ", "yunitator_old_X", "CHI vs FEM vs MAL")
yuni2 <- c(" ", " ", "`marvinator = TRUE`", " ", "yunitator_english_X", "")

# vocalization classification
# NOTE(review): the diagram earlier in this document labels the second output
# "vcm_english_X.rttm" while this table says "vcm_marvinator_X" -- confirm
# which template divime_classify_vox() actually produces.
vcm1 <- c("**vocalization classification**", "`divime_classify_vox`", "`marvinator = FALSE`", "yunitator_old_X", "vcm_X", "CNS vs NCS vs CRY vs OTH")
vcm2 <- c(" ", " ", "`marvinator = TRUE`", "yunitator_english_X", "vcm_marvinator_X", "")

# word count: the output name is "WCE_" followed by the name of the required
# input file
wce1 <- c("**word count**", "`divime_wordcount`", "`speech_annos = \"noisemes\"`", "noisemesSad_X", "WCE_noisemesSad_X", "")
wce2 <- c(" ", " ", "`speech_annos = \"opensmile\"`", "opensmileSad_X", "WCE_opensmileSad_X", "")
wce3 <- c(" ", " ", "`speech_annos = \"tocombo\"`", "tocomboSad_X", "WCE_tocomboSad_X", "")
wce4 <- c(" ", " ", "`speech_annos = \"yunitator_english\"`", "yunitator_english_X", "WCE_yunitator_english_X", "")

# One row per function variant, in pipeline order.
xtab <- rbind(sad1, sad2, sad3, diar1, diar2, diar3, yuni1, yuni2, vcm1, vcm2,
              wce1, wce2, wce3, wce4)
colnames(xtab) <- headers
# Render the overview matrix as a table, leaving out its row names.
knitr::kable(x = xtab, row.names = FALSE)

Set up

Requirements

The requirements are the vagrant software and VirtualBox, which provide the environment in which you can set up and run your virtual machine, which in effect gives you access to the DiViMe tools.

If you are on Windows you might also need git. If you are on a Mac chances are you have got git already.

For more details see https://divime.readthedocs.io/en/latest/.

For Windows users: whenever file paths need to be specified please use single forward slashes, e.g. c:/data/myaudiofiles

First off, you will need the following R packages installed:

Regarding the DiViMe tools themselves, you will require a fair amount of disk space on the drive where you are going to install the virtual machine (25GB+).

You also need sufficient working memory, at least 8GB.

The DiViMe documentation also lists at least two processor cores as requirement but allows adapting in case your computer only has one core. The function to set up the VM (described below) will check how many cores you have and adapt the installation process if necessary.

Regardless of whether you are using Windows or MacOS you will need vagrant and VirtualBox. I can't be sure whether the package works with all combinations of versions of these two programs, so if you want to play it safe, use vagrant 2.2.4 and VirtualBox 5.2.28, which are the versions that I used while writing this package.

If you use Windows, you probably also need git (https://git-scm.com/download/win), and I expect that version differences here don't make any difference so take the most recent version.

If you use a Mac, you might need to install Xcode in order to get access to git, but I'm not sure about this. So, try the test first that is mentioned just below and only if that fails, go ahead and install Xcode.

Once you installed everything, you can check whether so far all is set up correctly by using (you probably need to restart your computer though):

# Ask vagrant and git for their version; each executable is located on the
# search path via Sys.which().
system2(command = Sys.which("vagrant"), args = "-v")
system2(command = Sys.which("git"), args = "--version")

If this results in some meaningful output, i.e. you get some info about the versions installed, then you are good to go.

Set up the DiViMe virtual machine

This step describes the function that sets up the virtual machine. You can also omit this step and follow the instructions described here: https://divime.readthedocs.io/en/latest/install.html. In either case you need a running internet connection. If you already have a virtual machine installed, you can skip this section and go ahead to the next about testing whether your virtual machine can be accessed from R.

The function divime_vagrant_setup will take care of these steps without need for using bash commands. There are three things to note here:

# Mac: set up the virtual machine below /Volumes/Data/VMX
divime_vagrant_setup(
  divime_loc = "/Volumes/Data/VMX",
  memoryoverride = 4096
)
# Windows: set up the virtual machine below C:/Data/VMX
divime_vagrant_setup(
  divime_loc = "C:/Data/VMX",
  memoryoverride = 4096
)

Testing your setup

Testing the virtual machine

Once the setup step is completed, let's test whether we can communicate with the virtual machine from within R. For this we use divime_vagrant_state(), which allows checking or changing the state of the virtual machine. Note that you need to append DiViMe/ to the path of the installation.

# Mac: query the current state of the virtual machine
divime_vagrant_state(
  divime_loc = "/Volumes/Data/VMX/DiViMe/",
  what = "status"
)
# Windows: query the current state of the virtual machine
divime_vagrant_state(
  divime_loc = "C:/Data/VMX/DiViMe/",
  what = "status"
)

If this results in a printed message "running (virtualbox)", things are looking good. If you don't get this message, try:

# Mac: start the virtual machine
divime_vagrant_state(
  divime_loc = "/Volumes/Data/VMX/DiViMe/",
  what = "start"
)
# Windows: start the virtual machine
divime_vagrant_state(
  divime_loc = "C:/Data/VMX/DiViMe/",
  what = "start"
)

Testing the DiViMe tools

If the above steps were successful, you can run the test protocol of the DiViMe package. Running this will take a few minutes because it will download some example data.

# Mac: run the DiViMe test protocol
divime_test(
  divime_loc = "/Volumes/Data/VMX/DiViMe/"
)
# Windows: run the DiViMe test protocol
divime_test(
  divime_loc = "C:/Data/VMX/DiViMe/"
)

This step downloads a sound file from TalkBank and runs this file (a 5-minute segment) through most of the tools in the DiViMe package. The output of this function should be self-explanatory if successful.

Using the divime_ functions

Currently, there are the following functions available to run specific DiViMe tools:

The general idea of how these functions work is the same for all:

What the functions then do is to copy your audio file to your DiViMe location, have the DiViMe tools process them there, and copy the output (.rttm file(s)) back to the location of your audio files. After that is done, the files will be deleted from the DiViMe location again.

During these steps there are again a few things to note:

In practice, working with the divime_ functions could look like this.

# Example locations of the audio files and of the DiViMe virtual machine;
# adjust both to your own machine. The pair is chosen by platform so that
# running this chunk as a whole no longer leaves the Windows paths in place on
# a Mac.
if (.Platform$OS.type == "windows") {
  # Windows
  myaudio <- "D:/Data/audio"      # location for audio files
  divime <- "C:/Data/VMX/DiViMe/" # location for DiViMe VM
} else {
  # Mac
  myaudio <- "/Volumes/Data/audio"      # location for audio files
  divime <- "/Volumes/Data/VMX/DiViMe/" # location for DiViMe VM
}

If your audio location contains two .wav files, e.g. 'test1.wav' and 'test2.wav', running the following command will add two more files to this location ('opensmileSad_test1.rttm' and 'opensmileSad_test2.rttm').

# Run opensmile speech detection on every audio file in `myaudio`.
divime_sad(
  divime_loc = divime,
  audio_loc = myaudio,
  module = "opensmile"
)

If you then want to run the diarization tool, the following function will add two more files to your audio location ('diartk_opensmileSad_test1.rttm' and 'diartk_opensmileSad_test2.rttm').

# Run the diarization tool on top of the opensmile speech detection output.
divime_diarization(
  divime_loc = divime,
  audio_loc = myaudio,
  speech_annos = "opensmile"
)

Let's go for a proper example, i.e. using two of the example files provided in the package. These short files contain a few sentences spoken by a voice synthesizer. We start by setting up a temporary folder and copying the sound files there. (Note that I use the R temporary file folder for this, which is deleted once you close R.)

# Working folder "avutils" inside R's temporary directory, written with
# forward slashes on every platform.
tdir <- file.path(normalizePath(tempdir(), winslash = "/"), "avutils")
# Stay quiet if the folder is already there (e.g. when this chunk is re-run).
dir.create(tdir, showWarnings = FALSE)

# Copy the two example sound files that ship with the package.
file.copy(from = system.file("synthetic_speech.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech.wav"))
file.copy(from = system.file("synthetic_speech_overlap.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech_overlap.wav"))

# Open the location in the file browser. shell.exec() exists only on Windows
# and the `open` command only on macOS, so choose by platform; nothing is
# opened on other systems or in non-interactive sessions.
if (interactive()) {
  if (.Platform$OS.type == "windows") {
    shell.exec(tdir) # Windows
  } else if (Sys.info()[["sysname"]] == "Darwin") {
    system2("open", shQuote(tdir)) # Mac
  }
}

# Copy the example .rttm annotation files that ship with the package.
file.copy(from = system.file("opensmileSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
file.copy(from = system.file("opensmileSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech_overlap.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech_overlap.rttm"))

Now let's run two speech detection algorithms on the two files in this folder.

# opensmile speech detection on the files in the temporary folder
divime_sad(
  audio_loc = tdir,
  divime_loc = "/Volumes/Data/VMX/DiViMe/",
  module = "opensmile",
  vmshutdown = FALSE
)
# noisemes speech detection on the same files
divime_sad(
  audio_loc = tdir,
  divime_loc = "/Volumes/Data/VMX/DiViMe/",
  module = "noisemes",
  vmshutdown = FALSE
)

This should lead to four .rttm files being created in the data folder:

# Show what is now in the temporary folder.
list.files(path = tdir)

Now let's read back one of these files.

# Read the opensmile result for the first example file and display it.
rttm_path <- file.path(tdir, "opensmileSad_synthetic_speech.rttm")
file1 <- read_rttm(rttm_path)
file1

Please note that during the reading with read_rttm() I assign column names that are not present in the actual .rttm file and I create an extra column to set the end time of each segment.

# Print the raw content of the same .rttm file, one line per record.
raw_lines <- readLines(file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
cat(paste(raw_lines, collapse = "\n"))

DiViMe shiny app

The app is still in an early development stage.

# starts app in your default browser
divime_app()
# starts app in RStudio window (if you use RStudio)
divime_app(launchinbrowser = FALSE)

Evaluation of speech detection (NEEDS MAJOR REWORKING!!!)

We start by providing the paths to two annotation files (one manually coded with ELAN, one the output from noisemes), which are part of the package and relate to an audio file that I made up with synthetic speech.

# x2: the gold standard reference (manual coding, ELAN file)
x2 <- system.file("synthetic_speech.eaf", package = "avutils")
# x1: the 'test' file (noisemes output for the same recording)
x1 <- system.file("noisemesSad_synthetic_speech.rttm", package = "avutils")

The first example is 'normal' in that both files contain detected speech and the overlap between the two annotation sets is fairly good.

# Compare test and reference annotations at 500 evaluation points.
evaluate_sad(
  test = x1,
  reference = x2,
  overlap_res = 500
)

Let's first consider the graphical output. We see two sets of lines, one black and one gold, which refer to the 'test' file (x1) and the reference file (or the 'gold standard', x2 in our case). The lines represent the speech detected in both annotation sets. When you use ELAN files, note that the function will take all annotations from all tiers and consider them speech. If you want to exclude certain annotations or tiers, you can exclude them by providing the relevant arguments (see ?collapse_tiers for more details on this). On the left side, we can see the time stamps (in seconds) that are covered.

The next thing to note is the top horizontal axis (evaluation points). The corresponding argument (overlap_res=) controls how many evaluation points are used. At each evaluation point, the function compares the two annotation sets with respect to whether an annotation exists at this point or not. This contrasts somewhat with the evaluation pipeline in the Python version, where each individual sample is evaluated.

The last type of information provided by the graphic are red circles between the two line sets. These circles (if present) indicate a mismatch between the two annotation sets, i.e. one file has speech detected at this point while the other has not.

The numerical output provides three types of information. First, the false alarm rate, which takes all evaluation points that show no speech in the standard x2 and calculates the proportion of evaluation points that (falsely) contain speech in the test file x1. The missed speech rate is the opposite, that is the proportion of no-speech points in x1 when there is speech in the standard x2. Finally, there is the total congruence, which is just the total proportion of all evaluation points where x1 has the same value as x2. So, for FA and MSR, small values close to 0 are better, while for the total 1 is the best value.

For a proper evaluation, keep in mind to set overlap_res= to a reasonably large value. One rule of thumb could be to use ten evaluation points per second of recording, which would provide precision to the tenth of a second.

Visual versus numerical results

In order to aid visual assessment, evaluate_sad() allows breaking the display into multiple rows and/or selecting a time segment within the file. Note that when you 'break' the file over multiple segments, all the evaluation metrics are returned for each segment separately. In the specific example that follows, note that in the last segment, there are no false alarms at all.

# Split the display (and the reported metrics) into 5 segments.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 5,
  overlap_res = 100
)

You can also select a time window (the example here corresponds roughly to the second segment from just above).

# Restrict the evaluation to the window from 9.46 to 18.94.
evaluate_sad(
  test = x1,
  reference = x2,
  from = 9.46,
  to = 18.94,
  overlap_res = 100
)

Special cases

The DiViMe manual considers special cases when one or two of the files submitted contain no speech. Let's look at this with an example where we use predefined segments of the example files.

First, we consider the special case where the gold/manual annotation is empty and also the SAD module detected no speech.

# Special case 1: window from 27 to 28, no speech in either annotation set.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 27,
  to = 28,
  overlap_res = 10
)

As expected, both FA and MSR are 0 (as defined in the DiViMe manual).

Second, we consider the next case where the gold/manual annotation is empty but the SAD module detected some speech.

# Special case 2: window from 27 to 28.3, speech only in the test file.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 27,
  to = 28.3,
  overlap_res = 10
)

Here we see that the false alarm rate is 1 while the missed speech rate is set to 0.

And finally, the case where there was speech in the gold standard, but the SAD module didn't detect anything:

# Special case 3: window from 23.2 to 25, speech only in the reference.
evaluate_sad(
  test = x1,
  reference = x2,
  nsegments = 1,
  from = 23.2,
  to = 25,
  overlap_res = 10
)

Here, FA = 0 and MSR is 1.

splitting files

# Three panels side by side: the three special-case windows again, this time
# each split into two segments.
par(mfrow = c(1, 3), family = "serif")
evaluate_sad(
  test = x1, reference = x2, nsegments = 2,
  from = 27, to = 28, overlap_res = 10
)
evaluate_sad(
  test = x1, reference = x2, nsegments = 2,
  from = 27, to = 28.3, overlap_res = 10
)
evaluate_sad(
  test = x1, reference = x2, nsegments = 2,
  from = 23.2, to = 25, overlap_res = 10
)


gobbios/avutils documentation built on Feb. 19, 2020, 9:44 a.m.