#' Layer Spectrogram
#'
#' Computes the spectrogram of a signal using the STFT implemented in
#' tf.contrib.signal.
#'
#' It only works with the TensorFlow backend.
#'
#' @section Input shape:
#'
#' 3D tensor with shape: (samples, channels, audio_samples) if
#' data_format='channels_first' or 3D tensor with shape: (samples, audio_samples, channels)
#' if data_format='channels_last'.
#'
#' @section Output shape:
#'
#' 4D tensor with shape: (samples, frames, fft_unique_bins, channels) if
#' data_format='channels_last' or 4D tensor with shape: (samples, channels, frames, fft_unique_bins)
#' if data_format='channels_first'.
#'
#' @param object Model or layer object
#' @param frame_length The window length in samples.
#' @param frame_step The number of samples to step.
#' @param fft_length The size of the FFT to apply. If not provided, uses the
#' smallest power of 2 enclosing frame_length.
#' @param pad_end Whether to pad the end of signals with zeros when the provided
#' frame length and step produces a frame that lies partially past its end.
#' @param mode The mode of the spectrogram. Options are 'complex', 'power' or
#' 'magnitude'. The 'power' spectrogram is the squared magnitude of the complex-valued STFT.
#' A 'magnitude' spectrogram is the magnitude of the complex-valued STFT.
#' 'complex' returns the output of stft.
#' @param log_compress (TRUE/FALSE) It is common practice to apply a compressive nonlinearity
#' such as a logarithm or power-law compression to spectrograms. This helps to
#' balance the importance of detail in low and high energy regions of the spectrum,
#' which more closely matches human auditory sensitivity.
#' @param log_offset When compressing with a logarithm, it's a good idea to use
#' a stabilizing offset to avoid high dynamic ranges caused by the singularity
#' at zero.
#' @param name An optional name string for the layer. Should be unique in a
#' model (do not reuse the same name twice). It will be autogenerated if it
#' isn't provided.
#'
#' @examples
#' \dontrun{
#' library(keras)
#' library(kextra)
#' input <- layer_input(shape = c(16000, 1))
#' output <- layer_spectrogram(input, 100, 10)
#' }
#' @family audio
#'
#' @export
layer_spectrogram <- function(object, frame_length, frame_step, fft_length = NULL,
                              pad_end = FALSE, mode = "power", log_compress = FALSE,
                              log_offset = 1e-6, name = NULL) {
  # Assemble the layer configuration first, then hand it to create_layer().
  # Frame parameters must be integers for the underlying TF stft call;
  # fft_length may stay NULL (the layer then picks a default itself).
  config <- list(
    frame_length = as.integer(frame_length),
    frame_step   = as.integer(frame_step),
    fft_length   = as_nullable_integer(fft_length),
    pad_end      = pad_end,
    mode         = mode,
    log_compress = log_compress,
    log_offset   = log_offset,
    name         = name,
    trainable    = FALSE  # the layer has no weights to train
  )
  create_layer(Spectrogram, object, config)
}
# R6 implementation backing layer_spectrogram(). Computes an STFT-based
# spectrogram via tf.contrib.signal.stft; TensorFlow backend only.
Spectrogram <- R6::R6Class(
  "Spectrogram",
  inherit = keras::KerasLayer,
  public = list(
    frame_length = NULL,
    frame_step = NULL,
    fft_length = NULL,
    pad_end = NULL,
    mode = NULL,
    log_compress = NULL,
    log_offset = NULL,

    # Validate the configuration and store it on the instance.
    initialize = function(frame_length, frame_step, fft_length, pad_end, mode,
                          log_compress, log_offset = 1e-6) {
      if (keras::k_backend() != "tensorflow")
        stop("Layer spectrogram is only implemented for the tensorflow backend but is ",
             keras::k_backend())
      if (!mode %in% c("complex", "power", "magnitude"))
        stop("`mode` must be one of 'complex', 'power' or 'magnitude', not '",
             mode, "'")
      self$frame_length <- frame_length
      self$frame_step <- frame_step
      if (is.null(fft_length))
        # Smallest power of 2 enclosing frame_length (matches tf.signal.stft's
        # documented default). ceiling() keeps an exact power of 2 unchanged;
        # the previous 2^trunc(log2(x) + 1) wrongly doubled it in that case.
        self$fft_length <- as.integer(2^ceiling(log2(frame_length)))
      else
        self$fft_length <- fft_length
      self$pad_end <- pad_end
      self$mode <- mode
      self$log_compress <- log_compress
      self$log_offset <- log_offset
    },

    call = function(x, mask = NULL) {
      # stft operates over the last axis, so move audio samples there for
      # channels_last input: (samples, audio, channels) -> (samples, channels, audio).
      if (keras::k_image_data_format() == "channels_last")
        x <- keras::k_permute_dimensions(x, c(1L, 3L, 2L))
      out <- tf$contrib$signal$stft(
        x,
        frame_length = self$frame_length,
        frame_step = self$frame_step,
        fft_length = self$fft_length,
        pad_end = self$pad_end
      )
      # "power" = squared magnitude, "magnitude" = |STFT|,
      # "complex" leaves the raw complex STFT untouched.
      if (self$mode == "power")
        out <- tf$real(out * tf$conj(out))
      else if (self$mode == "magnitude")
        out <- tf$abs(out)
      if (self$log_compress)
        # Stabilizing offset avoids log(0) at silent bins.
        out <- tf$log(out + self$log_offset)
      # Move channels back to the last axis:
      # (samples, channels, frames, bins) -> (samples, frames, bins, channels).
      if (keras::k_image_data_format() == "channels_last")
        out <- keras::k_permute_dimensions(out, c(1L, 3L, 4L, 2L))
      out
    },

    compute_output_shape = function(input_shape) {
      samples <- input_shape[[1]]
      if (keras::k_image_data_format() == "channels_first") {
        channels <- input_shape[[2]]
        audio_len <- input_shape[[3]]
      } else {
        channels <- input_shape[[3]]
        audio_len <- input_shape[[2]]
      }
      if (self$pad_end)
        # With pad_end, tf.signal.stft produces ceiling(len / frame_step)
        # frames; the unpadded formula below would under-count.
        n_frames <- as.integer(ceiling(audio_len / self$frame_step))
      else
        n_frames <- (audio_len - self$frame_length) %/% self$frame_step + 1L
      # rfft of an N-point frame yields N/2 + 1 unique bins.
      fft_unique_bins <- as.integer(self$fft_length %/% 2 + 1)
      if (keras::k_image_data_format() == "channels_first")
        list(samples, channels, n_frames, fft_unique_bins)
      else
        list(samples, n_frames, fft_unique_bins, channels)
    }
  )
)
#' Layer Mel-Spectrogram
#'
#' Reweights the spectrogram to the mel-scale. Uses tf$contrib$signal$linear_to_mel_weight_matrix
#' to compute the matrix.
#'
#' It only works with the TensorFlow backend.
#'
#' @section Input shape:
#'
#' 4D tensor with shape (a spectrogram): (samples, channels, frames, fft_unique_bins) if
#' data_format='channels_first' or 4D tensor with shape: (samples, frames, fft_unique_bins, channels)
#' if data_format='channels_last'.
#'
#' @section Output shape:
#'
#' 4D tensor with shape: (samples, frames, num_mel_bins, channels) if
#' data_format='channels_last' or 4D tensor with shape: (samples, channels, frames, num_mel_bins)
#' if data_format='channels_first'.
#'
#' @param object Model or layer object
#' @param num_mel_bins How many bands in the resulting mel spectrum.
#' @param sample_rate Samples per second of the input signal used to create the spectrogram.
#' We need this to figure out the actual frequencies for each spectrogram bin,
#' which dictates how they are mapped into the mel scale.
#' @param lower_edge_hertz Lower bound on the frequencies to be included in the
#' mel spectrum. This corresponds to the lower edge of the lowest triangular band.
#' @param upper_edge_hertz The desired top edge of the highest frequency band.
#' @param log_compress (TRUE/FALSE) It is common practice to apply a compressive nonlinearity
#' such as a logarithm or power-law compression to spectrograms. This helps to
#' balance the importance of detail in low and high energy regions of the spectrum,
#' which more closely matches human auditory sensitivity.
#' @param log_offset When compressing with a logarithm, it's a good idea to use
#' a stabilizing offset to avoid high dynamic ranges caused by the singularity
#' at zero.
#' @param name An optional name string for the layer. Should be unique in a
#' model (do not reuse the same name twice). It will be autogenerated if it
#' isn't provided.
#'
#' @examples
#' \dontrun{
#' library(keras)
#' library(kextra)
#' input <- layer_input(shape = c(16000, 1))
#' output <- layer_spectrogram(input, 100, 10) %>% layer_mel_spectrogram(10)
#' }
#' @family audio
#'
#' @export
layer_mel_spectrogram <- function(object, num_mel_bins = 128, sample_rate = 16000,
                                  lower_edge_hertz = 0, upper_edge_hertz = 7400,
                                  log_compress = TRUE, log_offset = 1e-6,
                                  name = NULL) {
  # Bundle the mel-warping configuration and delegate to create_layer().
  # Integer coercion of the numeric arguments happens inside the layer's
  # initialize(), so the raw values are passed through unchanged here.
  config <- list(
    num_mel_bins     = num_mel_bins,
    sample_rate      = sample_rate,
    lower_edge_hertz = lower_edge_hertz,
    upper_edge_hertz = upper_edge_hertz,
    log_compress     = log_compress,
    log_offset       = log_offset,
    name             = name,
    trainable        = FALSE  # the mel weight matrix is fixed, not learned
  )
  create_layer(MelSpectrogram, object, config)
}
# R6 implementation backing layer_mel_spectrogram(). Re-weights a linear
# spectrogram onto the mel scale using tf.contrib.signal's weight matrix.
MelSpectrogram <- R6::R6Class(
  "MelSpectrogram",
  inherit = keras::KerasLayer,
  public = list(
    num_mel_bins = NULL,
    sample_rate = NULL,
    lower_edge_hertz = NULL,
    upper_edge_hertz = NULL,
    log_compress = NULL,
    log_offset = NULL,

    # Store the mel-filterbank configuration, coercing to integer where the
    # TF op expects integer arguments.
    initialize = function(num_mel_bins, sample_rate, lower_edge_hertz,
                          upper_edge_hertz, log_compress, log_offset) {
      self$num_mel_bins <- as.integer(num_mel_bins)
      self$sample_rate <- as.integer(sample_rate)
      self$lower_edge_hertz <- as.integer(lower_edge_hertz)
      self$upper_edge_hertz <- as.integer(upper_edge_hertz)
      self$log_compress <- log_compress
      self$log_offset <- log_offset
    },

    call = function(x, mask = NULL) {
      # Work in channels_first layout so the fft bins sit on the last axis:
      # (samples, frames, bins, channels) -> (samples, channels, frames, bins).
      if (keras::k_image_data_format() == "channels_last")
        x <- keras::k_permute_dimensions(x, c(1L, 4L, 2L, 3L))
      # Number of linear-frequency bins must be known statically to size the
      # mel weight matrix.
      num_spectrogram_bins <- x$get_shape()$as_list()[[4]]
      w_matrix <- tf$contrib$signal$linear_to_mel_weight_matrix(
        num_mel_bins = self$num_mel_bins,
        num_spectrogram_bins = num_spectrogram_bins,
        sample_rate = self$sample_rate,
        lower_edge_hertz = self$lower_edge_hertz,
        upper_edge_hertz = self$upper_edge_hertz
      )
      # Contract the last axis (fft bins) against the weight matrix,
      # yielding (..., frames, num_mel_bins).
      out <- tf$tensordot(x, w_matrix, 1L)
      if (self$log_compress)
        # Stabilizing offset avoids log(0) at silent bins.
        out <- tf$log(out + self$log_offset)
      # Restore channels_last layout for callers that expect it.
      if (keras::k_image_data_format() == "channels_last")
        out <- keras::k_permute_dimensions(out, c(1L, 3L, 4L, 2L))
      out
    },

    compute_output_shape = function(input_shape) {
      samples <- input_shape[[1]]
      if (keras::k_image_data_format() == "channels_first") {
        channels <- input_shape[[2]]
        n_frames <- input_shape[[3]]
      } else {
        channels <- input_shape[[3]]
        # Bug fix: a stray `n_frames <- input_shape[[3]]` after this branch
        # previously clobbered the correct channels_last value with the
        # channel count, reporting a wrong frame dimension.
        n_frames <- input_shape[[2]]
      }
      n_bins <- self$num_mel_bins
      if (keras::k_image_data_format() == "channels_first")
        list(samples, channels, n_frames, n_bins)
      else
        list(samples, n_frames, n_bins, channels)
    }
  )
)
# NOTE(review): removed stray website-embed boilerplate ("Add the following
# code to your website...") that had been pasted into this file; it was not
# valid R and broke parsing.