In gobbios/avutils: various utilities for DiViMe tools and audio processing

# Chunk defaults for this document: source and output are collapsed into a
# single block and printed output carries no comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "")

library(avutils)

Overview

The idea here is to provide R functions that wrap up the shell commands normally used to invoke the virtual machine with the DiViMe tools. The functions are by and large calls to the system2() function (or system()), with some processing around these calls. There is now also an early version of a shiny app (try divime_app()).

# Overview diagram: one column per task, four rows per column
# (heading, R call, .rttm file names, .sh script names).
par(family = "serif", mar = c(0, 0, 0, 0))
plot(0, 0, type = "n", xlim = c(0, 50), ylim = c(20, 0), axes = FALSE, ann = FALSE)
textcex <- 0.4
r <- FALSE # switch to TRUE to outline every cell with rect()

# Put `label` at the centre of the box spanned by the two x and the two y
# coordinates; any further arguments are forwarded to text().
put_label <- function(xs, ys, label, ...) {
  if (r) rect(xs[1], ys[1], xs[2], ys[2])
  text(x = mean(xs), y = mean(ys), labels = label, cex = textcex, ...)
}

# One entry per column, listed left to right.
tasks <- list(
  # speech activity detection
  list(
    xs = c(1, 9),
    heading = "speech activity detection",
    call = "divime_sad()",
    rttm = "noisemesSad_X.rttm\nopensmileSad_X.rttm\ntocomboSad_X.rttm",
    sh = "noisemesSad.sh\nopensmileSad.sh\ntocomboSad.sh"
  ),
  # diarization
  list(
    xs = c(11, 19),
    heading = "talker diarization",
    call = "divime_diarization(speech_annos = ...)",
    rttm = "diartk_noisemesSad_X.rttm\ndiartk_opensmileSad_X.rttm\ndiartk_tocomboSad_X.rttm",
    sh = "diartk.sh ... noisemesSad\ndiartk.sh ... opensmileSad\ndiartk.sh ... tocomboSad"
  ),
  # talker type
  list(
    xs = c(21, 29),
    heading = "talker type",
    call = "divime_talkertype(marvinator = ...)",
    rttm = "yunitator_old_X.rttm\nyunitator_english_X.rttm",
    sh = "yunitate.sh\nyunitate.sh ... english"
  ),
  # vocalization classification
  list(
    xs = c(31, 39),
    heading = "classify vocalizations",
    call = "divime_classify_vox(marvinator = ...)",
    rttm = "vcm_X.rttm\nvcm_english_X.rttm",
    sh = "vcm.sh\nvcm.sh ... english"
  ),
  # word count
  list(
    xs = c(41, 49),
    heading = "word count",
    call = "divime_wordcount(speech_annos = ...)",
    rttm = "WCE_noisemesSad_X.rttm\nWCE_opensmileSad_X.rttm\nWCE_tocomboSad_X.rttm\nWCE_yunitator_english_X.rttm",
    sh = "WCE_from_SAD_outputs.sh ... noisemesSad\nWCE_from_SAD_outputs.sh ... opensmileSad\nWCE_from_SAD_outputs.sh ... tocomboSad\nWCE_from_SAD_outputs.sh ... english"
  )
)

# Fill each column top to bottom; only the R call is set in monospace.
for (task in tasks) {
  put_label(task$xs, c(4, 2), task$heading)
  put_label(task$xs, c(6, 8), task$call, family = "mono")
  put_label(task$xs, c(10, 14), task$rttm)
  put_label(task$xs, c(16, 20), task$sh)
}

Here is a tabular overview.

# Tabular overview of the divime_ functions. Each row vector holds, in order:
# task, function, main argument, required input, output template, comment.
headers <- c("task", "function", "main argument", "requires", "output template", "comment")

# speech activity detection: no prerequisites
sad1 <- c("**speech detection**", "`divime_sad`", "`module = \"noisemes\"`", " ", "noisemesSad_X", "")
sad2 <- c(" ", " ",                           "`module = \"opensmile\"`", " ", "opensmileSad_X", "")
sad3 <- c(" ", " ",                           "`module = \"tocombo\"`", " ", "tocomboSad_X", "")

# diarization: the output template is "diartk_" followed by the SAD file it requires
diar1 <- c("**talker diarization**", "`divime_diarization`", "`speech_annos = \"noisemes\"`", "noisemesSad_X", "diartk_noisemesSad_X", "identification of individual speakers")
diar2 <- c(" ", " ",                                     "`speech_annos = \"opensmile\"`", "opensmileSad_X", "diartk_opensmileSad_X", "")
diar3 <- c(" ", " ",                                     "`speech_annos = \"tocombo\"`", "tocomboSad_X", "diartk_tocomboSad_X", "")

# talker type: no prerequisites
yuni1 <- c("**talker type**", "`divime_talkertype`", "`marvinator = FALSE`", " ", "yunitator_old_X", "CHI vs FEM vs MAL")
yuni2 <- c(" ", " ",                             "`marvinator = TRUE`", " ", "yunitator_english_X", "")

# vocalization classification: requires the matching talker type output
# NOTE(review): the overview diagram labels the marvinator output
# "vcm_english_X.rttm", this table says "vcm_marvinator_X" - confirm which
# template is correct.
vcm1 <- c("**vocalization classification**", "`divime_classify_vox`", "`marvinator = FALSE`", "yunitator_old_X", "vcm_X", "CNS vs NCS vs CRY vs OTH")
vcm2 <- c(" ", " ",                                               "`marvinator = TRUE`", "yunitator_english_X", "vcm_marvinator_X", "")

# word count: the output template is "WCE_" followed by the file it requires
wce1 <- c("**word count**", "`divime_wordcount`", "`speech_annos = \"noisemes\"`", "noisemesSad_X", "WCE_noisemesSad_X", "")
wce2 <- c(" ", " ",                           "`speech_annos = \"opensmile\"`", "opensmileSad_X", "WCE_opensmileSad_X", "")
wce3 <- c(" ", " ",                           "`speech_annos = \"tocombo\"`", "tocomboSad_X", "WCE_tocomboSad_X", "")
wce4 <- c(" ", " ",                           "`speech_annos = \"yunitator_english\"`", "yunitator_english_X", "WCE_yunitator_english_X", "")

# Stack the rows into a character matrix and label the columns.
xtab <- rbind(sad1, sad2, sad3, diar1, diar2, diar3, yuni1, yuni2, vcm1, vcm2, wce1, wce2, wce3, wce4)
colnames(xtab) <- headers
# Render the overview matrix as a table, omitting the row names.
knitr::kable(xtab, row.names = FALSE)

Set up

Requirements

The requirements are the vagrant software and VirtualBox, which together provide the environment in which you can set up and run your virtual machine, which in effect gives you access to the DiViMe tools.

If you are on Windows you might also need git. If you are on a Mac chances are you have got git already.

For more details see https://divime.readthedocs.io/en/latest/.

For Windows users: whenever file paths need to be specified please use single forward slashes, e.g. c:/data/myaudiofiles

First off, you will need the following R packages installed:

xml2
tuneR
parallel if you want to use this package to set up the DiViMe virtual machine from scratch
shiny and shinyFiles if you want to use the shiny interactive user interface

Regarding the DiViMe tools themselves, you will require a fair amount of disk space on the drive where you are going to install the virtual machine (25GB+).

You also need sufficient working memory, at least 8GB.

The DiViMe documentation also lists at least two processor cores as requirement but allows adapting in case your computer only has one core. The function to set up the VM (described below) will check how many cores you have and adapt the installation process if necessary.

Regardless of whether you are using Windows or MacOS you will need vagrant and VirtualBox. I can't be sure whether the package works with all combinations of versions of these two programs, so if you want to play it safe, use vagrant 2.2.4 and VirtualBox 5.2.28, which are the versions that I used while writing this package.

If you use Windows, you probably also need git (https://git-scm.com/download/win), and I expect that version differences here don't make any difference so take the most recent version.

If you use a Mac, you might need to install Xcode in order to get access to git, but I'm not sure about this. So, try the test first that is mentioned just below and only if that fails, go ahead and install Xcode.

Once you installed everything, you can check whether so far all is set up correctly by using (you probably need to restart your computer though):

# Look up the vagrant and git executables with Sys.which() and ask each one
# to print its version.
system2(Sys.which("vagrant"), "-v")
system2(Sys.which("git"), "--version")

If this results in some meaningful output, i.e. you get some info about the versions installed, then you are good to go.

Set up the DiViMe virtual machine

This step describes the function that sets up the virtual machine. You can also omit this step and follow the instructions described here: https://divime.readthedocs.io/en/latest/install.html. In either case you need a running internet connection. If you already have a virtual machine installed, you can skip this section and go ahead to the next about testing whether your virtual machine can be accessed from R.

The function divime_vagrant_setup will take care of these steps without need for using bash commands. There are three things to note here:

You need to choose a location where to install the virtual machine (remember that the drive needs plenty of free space).
By default, the virtual machine gets allocated 2GB of RAM. However, this might not be enough to run all the DiViMe tools. You can (and probably should) set the memory higher with the argument memoryoverride, to about half of the RAM that your computer has. For example, if you have 8GB of RAM, set memoryoverride = 4096.
Finally, note that the process of setting up the virtual machine will take quite some time: expect at least 20 minutes, up to an hour.

# with a Mac
divime_vagrant_setup(divime_loc = "/Volumes/Data/VMX", memoryoverride = 4096)
# with Windows
divime_vagrant_setup(divime_loc = "C:/Data/VMX", memoryoverride = 4096)

Testing your setup

Testing the virtual machine

Once the setup step is completed, let's test whether we can communicate with the virtual machine from within R. For this we use divime_vagrant_state(), which allows checking or changing the state of the virtual machine. Note that you need to append DiViMe/ to the path of the installation.

# with a Mac
divime_vagrant_state(divime_loc = "/Volumes/Data/VMX/DiViMe/", what = "status")
# with Windows
divime_vagrant_state(divime_loc = "C:/Data/VMX/DiViMe/", what = "status")

If this results in a printed message "running (virtualbox)", things are looking good. If you don't get this message, try:

# with a Mac
divime_vagrant_state(divime_loc = "/Volumes/Data/VMX/DiViMe/", what = "start")
# with Windows
divime_vagrant_state(divime_loc = "C:/Data/VMX/DiViMe/", what = "start")

Testing the DiViMe tools

If the above steps were successful, you can run the test protocol of the DiViMe package. Running this will take a few minutes because it will download some example data.

# with a Mac
divime_test(divime_loc = "/Volumes/Data/VMX/DiViMe/")
# with Windows
divime_test(divime_loc = "C:/Data/VMX/DiViMe/")

This step downloads a sound file from TalkBank and runs this file (a 5-minute segment) through most of the tools in the DiViMe package. The output of this function should be self-explanatory if successful.

Using the `divime_` functions

Currently, there are the following functions available to run specific DiViMe tools:

divime_sad(): runs SAD modules with the option to choose between opensmile, noisemes and tocombo
divime_diarization(): runs diartk with your choice of existing SAD output
divime_talkertype(): runs yunitate
divime_classify_vox(): runs vcm
divime_fullnoisemes(): runs the noisemes module but outputs all found noiseme types (including non-speech)
divime_wordcount(): runs wce (word count), which requires speech detection output from either divime_sad() or divime_talkertype(marvinator = TRUE, ...)
divime_run_all(): runs all the above tools in one go

The general idea of how these functions work is the same for all:

Specify where your VM is located.
Specify a location where you have your audio files stored.
for divime_sad() you need to specify which SAD module you wish to run with the module= argument
for divime_diarization() you need to specify what kind of existing SADs you want to use with the speech_annos= argument (sorry for the different argument names, I will eventually make them the same for both functions...)

What the functions then do is to copy your audio file to your DiViMe location, have the DiViMe tools process them there, and copy the output (.rttm file(s)) back to the location of your audio files. After that is done, the files will be deleted from the DiViMe location again.

During these steps there are again a few things to note:

one major difference of this package to the instructions of the DiViMe docs is that you can keep your files wherever you want, i.e. you don't need to copy them manually to the DiViMe folder...
one consequence of this is that you should in fact store your files outside the DiViMe location because files in the DiViMe/data folder will be deleted...
also note that if you run any of the divime_ functions, the default is that any existing output is not overwritten if it already exists. You can/have to (if desired) force overwriting of existing results (i.e. .rttm files) by setting the overwrite= argument to TRUE.

In practice, working with the divime_ functions could look like this.

# Mac
myaudio <- "/Volumes/Data/audio"      # location for audio files
divime <- "/Volumes/Data/VMX/DiViMe/" # location for DiViMe VM

# Windows
myaudio <- "D:/Data/audio"      # location for audio files
divime <- "C:/Data/VMX/DiViMe/" # location for DiViMe VM

If your audio location contains two .wav files, e.g. 'test1.wav' and 'test2.wav', running the following command will add two more files to this location ('opensmileSad_test1.rttm' and 'opensmileSad_test2.rttm').

divime_sad(divime_loc = divime, audio_loc = myaudio, module = "opensmile")

If you then want to run the diarization tool, the following function will add two more files to your audio location ('diartk_opensmileSad_test1.rttm' and 'diartk_opensmileSad_test2.rttm').

divime_diarization(divime_loc = divime, audio_loc = myaudio, speech_annos = "opensmile")

Let's go for a proper example, i.e. using two of the example files provided in the package. These short files contain a few sentences spoken by a voice synthesizer. We start by setting up a temporary folder and copying the sound files there. (Note that I use the R temporary file folder for this, which is deleted once you close R.)

# Working folder "avutils" inside R's session temp directory; the path is
# normalised to forward slashes.
tdir <- file.path(normalizePath(tempdir(), winslash = "/"), "avutils")
dir.create(tdir)
# Copy the two example sound files shipped with the package into the folder.
file.copy(from = system.file("synthetic_speech.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech.wav"))
file.copy(from = system.file("synthetic_speech_overlap.wav", package = "avutils"),
          to = file.path(tdir, "synthetic_speech_overlap.wav"))

# open location in file browser
# (shell.exec() is only available in R for Windows; run the line for your OS)
shell.exec(tdir) # Windows
system2("open", tdir) # Mac

# Copy the opensmile and noisemes SAD results (.rttm) shipped with the package
# into the same folder.
# NOTE(review): presumably these stand in for the output of the divime_sad()
# calls shown further down, so the examples work without a VM - confirm.
file.copy(from = system.file("opensmileSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
file.copy(from = system.file("opensmileSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "opensmileSad_synthetic_speech_overlap.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech.rttm"))
file.copy(from = system.file("noisemesSad_synthetic_speech_overlap.rttm", package = "avutils"),
          to = file.path(tdir, "noisemesSad_synthetic_speech_overlap.rttm"))

Now let's run two speech detection algorithms on the two files in this folder.

# Run the opensmile and the noisemes SAD module on the files in tdir.
# divime_loc is the Mac example path used throughout this document; replace it
# with the location of your own DiViMe installation.
divime_sad(audio_loc = tdir, divime_loc = "/Volumes/Data/VMX/DiViMe/", module = "opensmile", vmshutdown = FALSE)
divime_sad(audio_loc = tdir, divime_loc = "/Volumes/Data/VMX/DiViMe/", module = "noisemes", vmshutdown = FALSE)

This should lead to four .rttm files being created in the data folder:

list.files(tdir)

Now let's read back one of these files.

file1 <- read_rttm(file.path(tdir, "opensmileSad_synthetic_speech.rttm"))
file1

Please note that during the reading with read_rttm() I assign column names that are not present in the actual .rttm file and I create an extra column to set the end time of each segment.

cat(paste(readLines(file.path(tdir, "opensmileSad_synthetic_speech.rttm")), collapse = "\n"))

DiViMe shiny app

The app is still in an early development stage.

divime_app() # starts app in your default browser
divime_app(launchinbrowser = FALSE) # starts app in RStudio window (if you use RStudio)

Evaluation of speech detection (NEEDS MAJOR REWORKING!!!)

We start by providing the paths to two annotation files (one manually coded with ELAN, one the output from noisemes), which are part of the package and relate to an audio file that I made up with synthetic speech.

# a 'test' file
x1 <- system.file("noisemesSad_synthetic_speech.rttm", package = "avutils")
# the gold standard reference (manual coding)
x2 <- system.file("synthetic_speech.eaf", package = "avutils")

The first example is 'normal' in that both files contain detected speech and the overlap between the two annotation sets is fairly good.

evaluate_sad(test = x1, reference = x2, overlap_res = 500)

Let's first consider the graphical output. We see two sets of lines, one black and one gold, which refer to the 'test' file (x1) and the reference file (or the 'gold standard', x2 in our case). The lines represent the speech detected in both annotation sets. When you use ELAN files, note that the function will take all annotations from all tiers and consider them speech. If you want to exclude certain annotations or tiers, you can exclude them by providing the relevant arguments (see ?collapse_tiers for more details on this). On the left side, we can see the time stamps (in seconds) that are covered.

The next thing to note is the top horizontal axis (evaluation points). The corresponding argument (overlap_res=) controls how many evaluation points are used. At each evaluation point, the function compares the two annotation sets with respect to whether an annotation exists at this point or not. This contrasts somewhat with the evaluation pipeline in the Python version, where each individual sample is evaluated.

The last type of information provided by the graphic are red circles between the two line sets. These circles (if present) indicate a mismatch between the two annotation sets, i.e. one file has speech detected at this point while the other has not.

The numerical output provides three types of information. First, the false alarm rate, which takes all evaluation points that show no speech in the standard x2 and calculates the proportion of evaluation points that (falsely) contain speech in the test file x1. The missed speech rate is the opposite, that is, the proportion of no-speech points in x1 when there is speech in the standard x2. Finally, there is the total congruence, which is just the total proportion of all evaluation points where x1 has the same value as x2. So, for FA and MSR, small values close to 0 are better, while for the total 1 is the best value.

For a proper evaluation, keep in mind to set overlap_res= to a reasonably large value. One rule of thumb could be to use ten evaluation points per second of recording, which would provide precision to the tenth of a second.

Visual versus numerical results

In order to aid visual assessment, evaluate_sad() allows you to break the display into multiple rows and/or to select a time segment within the file. Note that when you 'break' the file over multiple segments, all the evaluation metrics are returned for each segment separately. In the specific example that follows, note that in the last segment, there are no false alarms at all.

# break the display over 5 segments
evaluate_sad(test = x1, reference = x2, nsegments = 5, overlap_res = 100)

You can also select a time window. (the example here corresponds roughly to the second segment from just above).

evaluate_sad(test = x1, reference = x2, from = 9.46, to = 18.94, overlap_res = 100)

Special cases

The DiViMe manual considers special cases when one or two of the files submitted contain no speech. Let's look at this with an example where we use predefined segments of the example files.

First, we consider the special case where the gold/manual annotation is empty and also the SAD module detected no speech.

evaluate_sad(test = x1, reference = x2, nsegments = 1, from = 27, to = 28, overlap_res = 10)

As expected, both FA and MSR are 0 (as defined in the DiViMe manual).

Second, we consider the next case where the gold/manual annotation is empty but the SAD module detected some speech.

evaluate_sad(test = x1, reference = x2, nsegments = 1, from = 27, to = 28.3, overlap_res = 10)

Here we see that the false alarm rate is 1 while the missed speech rate is set to 0.

And finally, the case where there was speech in the gold standard, but the SAD module didn't detect anything:

evaluate_sad(test = x1, reference = x2, nsegments = 1, from = 23.2, to = 25, overlap_res = 10)

Here, FA = 0 and MSR is 1.

splitting files

# Three plots side by side, using the same three time windows as the special
# cases above, each window now split into two segments (nsegments = 2).
par(mfrow = c(1, 3), family = "serif")
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 27, to = 28, overlap_res = 10)
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 27, to = 28.3, overlap_res = 10)
evaluate_sad(test = x1, reference = x2, nsegments = 2, from = 23.2, to = 25, overlap_res = 10)

gobbios/avutils documentation built on Feb. 19, 2020, 9:44 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

gobbios/avutils
various utilities for DiViMe tools and audio processing

In gobbios/avutils: various utilities for DiViMe tools and audio processing

Overview

Set up

Requirements

Set up the DiViMe virtual machine

Testing your setup

Testing the virtual machine

Testing the DiViMe tools

Using the `divime_` functions

DiViMe shiny app

Evaluation of speech detection (NEEDS MAJOR REWORKING!!!)

Visual versus numerical results

Special cases

splitting files

R Package Documentation

Browse R Packages

We want your feedback!

gobbios/avutils various utilities for DiViMe tools and audio processing

In gobbios/avutils: various utilities for DiViMe tools and audio processing

Overview

Set up

Requirements

Set up the DiViMe virtual machine

Testing your setup

Testing the virtual machine

Testing the DiViMe tools

Using the divime_ functions

DiViMe shiny app

Evaluation of speech detection (NEEDS MAJOR REWORKING!!!)

Visual versus numerical results

Special cases

splitting files

R Package Documentation

Browse R Packages

We want your feedback!

gobbios/avutils
various utilities for DiViMe tools and audio processing

Using the `divime_` functions