kextra: Extra Layers for Keras in R

#' Layer Spectrogram
#'
#' Computes the spectrogram of a signal using the STFT implemented in
#' tf.contrib.signal.
#'
#' It only works with the TensorFlow backend.
#'
#' @section Input shape:
#'
#'   3D tensor with shape: (samples, channels, audio_samples) if
#'   data_format='channels_first' or 3D tensor with shape: (samples, audio_samples, channels)
#'   if data_format='channels_last'.
#'
#' @section Output shape:
#'
#'   4D tensor with shape: (samples, frames, fft_unique_bins, channels) if
#'   data_format='channels_last' or 4D tensor with shape: (samples, channels, frames, fft_unique_bins)
#'   if data_format='channels_first'.
#'
#' @param object Model or layer object
#' @param frame_length The window length in samples.
#' @param frame_step The number of samples to step.
#' @param fft_length The size of the FFT to apply. If not provided, uses the
#' smallest power of 2 enclosing frame_length.
#' @param pad_end Whether to pad the end of signals with zeros when the provided
#' frame length and step produces a frame that lies partially past its end.
#' @param mode The mode of the spectrogram. Options are 'complex', 'power' or
#' 'magnitude'. The 'power' spectrogram is the squared magnitude of the complex-valued STFT.
#' A 'magnitude' spectrogram is the magnitude of the complex-valued STFT.
#' 'complex' returns the output of stft.
#' @param log_compress (TRUE/FALSE) It is common practice to apply a compressive nonlinearity
#' such as a logarithm or power-law compression to spectrograms. This helps to
#' balance the importance of detail in low and high energy regions of the spectrum,
#' which more closely matches human auditory sensitivity.
#' @param log_offset When compressing with a logarithm, it's a good idea to use
#' a stabilizing offset to avoid high dynamic ranges caused by the singularity
#' at zero.
#' @param name An optional name string for the layer. Should be unique in a
#' model (do not reuse the same name twice). It will be autogenerated if it
#' isn't provided.
#'
#' @examples
#' \dontrun{
#' library(keras)
#' library(kextra)
#' input <- layer_input(shape = c(16000, 1))
#' output <- layer_spectrogram(input, 100, 10)
#' }
#' @family audio
#'
#' @export
layer_spectrogram <- function(object, frame_length, frame_step, fft_length = NULL,
                              pad_end = FALSE, mode = "power", log_compress = FALSE,
                              log_offset = 1e-6, name = NULL) {
  # Framing / FFT configuration. Sizes are coerced to integers; `fft_length`
  # may stay NULL so the layer class can pick its own default.
  framing_args <- list(
    frame_length = as.integer(frame_length),
    frame_step = as.integer(frame_step),
    fft_length = as_nullable_integer(fft_length),
    pad_end = pad_end
  )

  # How the complex STFT is post-processed.
  output_args <- list(
    mode = mode,
    log_compress = log_compress,
    log_offset = log_offset
  )

  # Generic layer arguments; the layer is created as non-trainable.
  common_args <- list(
    name = name,
    trainable = FALSE
  )

  # c() on lists keeps NULL entries, so the combined list matches a single
  # list() call with the same elements in the same order.
  create_layer(Spectrogram, object, c(framing_args, output_args, common_args))
}

# R6 layer class backing `layer_spectrogram()`.
#
# Holds the STFT configuration, computes the (optionally power / magnitude and
# log-compressed) spectrogram in `call()`, and reports the resulting shape in
# `compute_output_shape()`. Only the TensorFlow backend is supported.
Spectrogram <- R6::R6Class(
  "Spectrogram",

  inherit = keras::KerasLayer,

  public = list(

    frame_length = NULL,
    frame_step = NULL,
    fft_length = NULL,
    pad_end = NULL,
    mode = NULL,
    log_compress = NULL,
    log_offset = NULL,

    # Validate the configuration and store it on the layer.
    #
    # frame_length, frame_step: window length and hop size, in samples.
    # fft_length: FFT size; when NULL the smallest power of two that is
    #   >= frame_length is used.
    # pad_end: forwarded to the STFT; also affects the reported frame count.
    # mode: one of "power", "magnitude" or "complex".
    # log_compress, log_offset: whether to apply log(out + log_offset).
    initialize = function(frame_length, frame_step, fft_length, pad_end, mode,
                          log_compress, log_offset = 1e-6) {

      if (keras::k_backend() != "tensorflow")
        stop("Layer spectrogram is only implement for the tensorflow backend but is ", keras::k_backend())

      # Fail early on an unknown mode instead of silently returning the
      # complex STFT (which is what `call()` does for unmatched values).
      valid_modes <- c("power", "magnitude", "complex")
      if (!is.character(mode) || length(mode) != 1L || !mode %in% valid_modes)
        stop("`mode` must be one of 'power', 'magnitude' or 'complex'.",
             call. = FALSE)

      self$frame_length <- frame_length
      self$frame_step <- frame_step

      if (is.null(fft_length)) {
        # Smallest power of two enclosing frame_length. ceiling() keeps exact
        # powers of two unchanged (128 -> 128), whereas trunc(log2(n) + 1)
        # would double them (128 -> 256).
        self$fft_length <- as.integer(2^ceiling(log2(frame_length)))
      } else {
        self$fft_length <- fft_length
      }

      self$pad_end <- pad_end
      self$mode <- mode
      self$log_compress <- log_compress
      self$log_offset <- log_offset
    },

    # Compute the spectrogram of `x`.
    #
    # For 'channels_last' the input is permuted so that the audio samples are
    # on the last axis before the STFT, and the result is permuted back so
    # that channels end up last again.
    call = function(x, mask = NULL) {

      channels_last <- keras::k_image_data_format() == "channels_last"

      if (channels_last)
        x <- keras::k_permute_dimensions(x, c(1L, 3L, 2L))

      out <- tf$contrib$signal$stft(
        x,
        frame_length = self$frame_length,
        frame_step = self$frame_step,
        fft_length = self$fft_length,
        pad_end = self$pad_end
      )

      # "complex" leaves the STFT output untouched.
      if (self$mode == "power")
        out <- tf$real(out * tf$conj(out))
      else if (self$mode == "magnitude")
        out <- tf$abs(out)

      if (self$log_compress)
        out <- tf$log(out + self$log_offset)

      if (channels_last)
        out <- keras::k_permute_dimensions(out, c(1L, 3L, 4L, 2L))

      out
    },

    # Shape of the output for a given 3D `input_shape`.
    #
    # Returns list(samples, channels, frames, fft_unique_bins) for
    # 'channels_first' and list(samples, frames, fft_unique_bins, channels)
    # for 'channels_last'. An unknown (NULL) signal length yields an unknown
    # (NULL) number of frames.
    compute_output_shape = function(input_shape) {

      channels_last <- keras::k_image_data_format() == "channels_last"

      samples <- input_shape[[1]]

      if (channels_last) {
        audio_samples <- input_shape[[2]]
        channels <- input_shape[[3]]
      } else {
        channels <- input_shape[[2]]
        audio_samples <- input_shape[[3]]
      }

      if (is.null(audio_samples)) {
        n_frames <- NULL
      } else if (isTRUE(self$pad_end)) {
        # With end padding every started hop yields a frame:
        # ceiling(audio_samples / frame_step), via integer floor division.
        n_frames <- as.integer(-((-audio_samples) %/% self$frame_step))
      } else {
        # Only complete frames are kept; never report a negative count when
        # the signal is shorter than one frame.
        n_frames <- as.integer(max(
          0,
          (audio_samples - self$frame_length) %/% self$frame_step + 1
        ))
      }

      fft_unique_bins <- as.integer(self$fft_length %/% 2 + 1)

      if (channels_last)
        list(samples, n_frames, fft_unique_bins, channels)
      else
        list(samples, channels, n_frames, fft_unique_bins)
    }

  )
)

#' Layer Mel-Spectrogram
#'
#' Reweights the spectrogram to the mel-scale. Uses tf$contrib$signal$linear_to_mel_weight_matrix
#' to compute the matrix.
#'
#' It only works with the TensorFlow backend.
#'
#' @section Input shape:
#'
#'   4D tensor with shape (a spectrogram): (samples, channels, frames, fft_unique_bins) if
#'   data_format='channels_first' or 4D tensor with shape: (samples, frames, fft_unique_bins, channels)
#'   if data_format='channels_last'.
#'
#' @section Output shape:
#'
#'   4D tensor with shape: (samples, frames, num_mel_bins, channels) if
#'   data_format='channels_last' or 4D tensor with shape: (samples, channels, frames, num_mel_bins)
#'   if data_format='channels_first'.
#'
#' @param object Model or layer object
#' @param num_mel_bins How many bands in the resulting mel spectrum.
#' @param sample_rate Samples per second of the input signal used to create the spectrogram.
#' We need this to figure out the actual frequencies for each spectrogram bin,
#' which dictates how they are mapped into the mel scale.
#' @param lower_edge_hertz Lower bound on the frequencies to be included in the
#' mel spectrum. This corresponds to the lower edge of the lowest triangular band.
#' @param upper_edge_hertz The desired top edge of the highest frequency band.
#' @param log_compress (TRUE/FALSE) It is common practice to apply a compressive nonlinearity
#' such as a logarithm or power-law compression to spectrograms. This helps to
#' balance the importance of detail in low and high energy regions of the spectrum,
#' which more closely matches human auditory sensitivity.
#' @param log_offset When compressing with a logarithm, it's a good idea to use
#' a stabilizing offset to avoid high dynamic ranges caused by the singularity
#' at zero.
#' @param name An optional name string for the layer. Should be unique in a
#' model (do not reuse the same name twice). It will be autogenerated if it
#' isn't provided.
#'
#' @examples
#' \dontrun{
#' library(keras)
#' library(kextra)
#' input <- layer_input(shape = c(16000, 1))
#' output <- layer_spectrogram(input, 100, 10) %>% layer_mel_spectrogram(10)
#' }
#' @family audio
#'
#' @export
layer_mel_spectrogram <- function(object, num_mel_bins = 128, sample_rate = 16000,
                                  lower_edge_hertz = 0, upper_edge_hertz = 7400,
                                  log_compress = TRUE, log_offset = 1e-6,
                                  name = NULL) {

  # Definition of the mel filter bank.
  filterbank_args <- list(
    num_mel_bins = num_mel_bins,
    sample_rate = sample_rate,
    lower_edge_hertz = lower_edge_hertz,
    upper_edge_hertz = upper_edge_hertz
  )

  # Optional log compression of the re-weighted spectrogram.
  compression_args <- list(
    log_compress = log_compress,
    log_offset = log_offset
  )

  # Generic layer arguments; the layer is created as non-trainable.
  common_args <- list(
    name = name,
    trainable = FALSE
  )

  # c() on lists keeps NULL entries, so the combined list matches a single
  # list() call with the same elements in the same order.
  create_layer(
    MelSpectrogram,
    object,
    c(filterbank_args, compression_args, common_args)
  )
}

# R6 layer class backing `layer_mel_spectrogram()`.
#
# Re-weights the frequency axis of a 4D spectrogram with a mel weight matrix
# built by tf$contrib$signal$linear_to_mel_weight_matrix and optionally
# applies log compression. Only the TensorFlow backend is supported.
MelSpectrogram <- R6::R6Class(
  "MelSpectrogram",
  inherit = keras::KerasLayer,

  public = list(

    num_mel_bins = NULL,
    sample_rate = NULL,
    lower_edge_hertz = NULL,
    upper_edge_hertz = NULL,
    log_compress = NULL,
    log_offset = NULL,

    # Store the mel filter bank configuration.
    #
    # num_mel_bins, sample_rate, lower_edge_hertz and upper_edge_hertz are
    # coerced with as.integer(), so fractional values are truncated.
    # log_compress, log_offset: whether to apply log(out + log_offset).
    initialize = function(num_mel_bins, sample_rate, lower_edge_hertz,
                          upper_edge_hertz, log_compress, log_offset) {

      # Same guard as the Spectrogram layer: `call()` relies on tf ops.
      if (keras::k_backend() != "tensorflow")
        stop("Layer mel spectrogram is only implemented for the tensorflow ",
             "backend but is ", keras::k_backend())

      self$num_mel_bins <- as.integer(num_mel_bins)
      self$sample_rate <- as.integer(sample_rate)
      self$lower_edge_hertz <- as.integer(lower_edge_hertz)
      self$upper_edge_hertz <- as.integer(upper_edge_hertz)
      self$log_compress <- log_compress
      self$log_offset <- log_offset

    },

    # Map the spectrogram `x` onto the mel scale.
    #
    # For 'channels_last' the input is permuted so that the frequency bins are
    # on the last axis, and the result is permuted back so that channels end
    # up last again.
    call = function(x, mask = NULL) {

      channels_last <- keras::k_image_data_format() == "channels_last"

      if (channels_last)
        x <- keras::k_permute_dimensions(x, c(1L, 4L, 2L, 3L))

      # Number of linear-frequency bins = size of the (now) last axis.
      num_spectrogram_bins <- x$get_shape()$as_list()[[4]]

      w_matrix <- tf$contrib$signal$linear_to_mel_weight_matrix(
        num_mel_bins = self$num_mel_bins,
        num_spectrogram_bins = num_spectrogram_bins,
        sample_rate = self$sample_rate,
        lower_edge_hertz = self$lower_edge_hertz,
        upper_edge_hertz = self$upper_edge_hertz
      )

      # Contract the last axis of x with the first axis of the weight matrix.
      out <- tf$tensordot(x, w_matrix, 1L)

      if (self$log_compress)
        out <- tf$log(out + self$log_offset)

      if (channels_last)
        out <- keras::k_permute_dimensions(out, c(1L, 3L, 4L, 2L))

      out
    },

    # Shape of the output for a given 4D `input_shape`.
    #
    # The frequency axis is replaced by `num_mel_bins`; all other axes are
    # passed through unchanged:
    #   channels_first: (samples, channels, frames, bins)
    #                     -> list(samples, channels, frames, num_mel_bins)
    #   channels_last:  (samples, frames, bins, channels)
    #                     -> list(samples, frames, num_mel_bins, channels)
    compute_output_shape = function(input_shape) {
      samples <- input_shape[[1]]
      n_bins <- self$num_mel_bins

      if (keras::k_image_data_format() == "channels_last") {
        # Channels live on the 4th axis here; index 3 holds the fft bins.
        n_frames <- input_shape[[2]]
        channels <- input_shape[[4]]
        list(samples, n_frames, n_bins, channels)
      } else {
        channels <- input_shape[[2]]
        n_frames <- input_shape[[3]]
        list(samples, channels, n_frames, n_bins)
      }
    }

  )
)