# R/audio_cognizers.R

#' @title IBM Watson Audio Transcriber
#' @description Convert your audio to transcripts with optional keyword
#'   detection and profanity cleaning.
#' @param audios Character vector (or list) of paths to the audio files to be
#'   transcribed. One request is issued per element.
#' @param userpwd Character scalar containing username:password for the service.
#' @param keep_data Character scalar sent verbatim as the value of the
#'   \code{X-Watson-Learning-Opt-Out} request header, which controls whether
#'   your data is shared with Watson services for the purpose of training their
#'   models. NOTE(review): the default "true" presumably opts out of sharing
#'   (i.e. keeps your data private) -- confirm against the service
#'   documentation.
#' @param callback Function of one argument (the response) that can be applied
#'   to responses to examine http status, headers, and content, to debug or to
#'   write a custom parser for content. Whatever it returns is stored as the
#'   result for the corresponding audio. The default (\code{NULL}) parses the
#'   JSON content of each response with \code{fromJSON} while dropping other
#'   response values, to make the output easily passable to tidyverse packages
#'   like dplyr or ggplot2. For further details or debugging one can pass a
#'   print or a more complicated function.
#' @param model Character scalar specifying language and bandwidth model.
#'   Defaults to en-US_BroadbandModel. Alternatives
#'   are ar-AR_BroadbandModel, en-UK_BroadbandModel, en-UK_NarrowbandModel,
#'   en-US_NarrowbandModel, es-ES_BroadbandModel, es-ES_NarrowbandModel,
#'   fr-FR_BroadbandModel, ja-JP_BroadbandModel, ja-JP_NarrowbandModel,
#'   pt-BR_BroadbandModel, pt-BR_NarrowbandModel, zh-CN_BroadbandModel,
#'   zh-CN_NarrowbandModel.
#' @param inactivity_timeout Integer scalar giving the number of seconds after which
#'   the result is returned if no speech is detected.
#' @param keywords List of keywords to be detected in the speech stream.
#' @param keywords_threshold Double scalar from 0 to 1 specifying the lower bound on
#'   confidence to accept detected keywords in speech. Omitted from the request
#'   when \code{NA}.
#' @param max_alternatives Integer scalar giving the maximum number of alternative
#'   transcripts to return.
#' @param word_alternatives_threshold Double scalar from 0 to 1 giving lower bound
#'   on confidence of possible words. Omitted from the request when \code{NA}.
#' @param word_confidence Logical scalar indicating whether to return confidence for
#'   each word.
#' @param timestamps Logical scalar indicating whether to return time alignment for
#'   each word.
#' @param profanity_filter Logical scalar indicating whether to censor profane words.
#' @param smart_formatting Logical scalar indicating whether dates, times, numbers, etc.
#'   are to be formatted nicely in the transcript.
#' @param content_type Character scalar showing format of the audio file. Alternatives
#'   are audio/flac, audio/l16;rate=n;channels=k (16 channel limit),
#'   audio/wav (9 channel limit), audio/ogg;codecs=opus,
#'   audio/basic (narrowband models only).
#' @param speaker_labels Logical scalar indicating whether to infer speakers on a mono
#'   channel. Automatically turns on timestamp collection for each word.
#' @return List with one element per entry of \code{audios}, in the same order.
#'   Each element holds the parsed response (or the value returned by
#'   \code{callback}); for a request that failed, it holds whatever curl passed
#'   to the failure callback.
#' @export
audio_text <- function(
  audios,
  userpwd,
  keep_data = "true",
  callback = NULL,
  model = "en-US_BroadbandModel",
  inactivity_timeout = -1,
  keywords = list(),
  keywords_threshold = NA,
  max_alternatives = 1,
  word_alternatives_threshold = NA,
  word_confidence = FALSE,
  timestamps = FALSE,
  profanity_filter = TRUE,
  smart_formatting = FALSE,
  content_type = "audio/wav",
  speaker_labels = FALSE
) {
  url <- paste0(
    "https://",
    "stream.watsonplatform.net/speech-to-text/api/v1/recognize?",
    "model=", model
  )

  metadata <- list(
    "part_content_type" = content_type,
    "data_parts_count" = 1,
    "inactivity_timeout" = inactivity_timeout,
    "keywords" = keywords,
    "keywords_threshold" = keywords_threshold,
    "max_alternatives" = max_alternatives,
    "word_alternatives_threshold" = word_alternatives_threshold,
    "word_confidence" = word_confidence,
    "timestamps" = timestamps,
    "profanity_filter" = profanity_filter,
    "smart_formatting" = smart_formatting,
    "speaker_labels" = speaker_labels
  )
  # Options left at NA are dropped so they are not sent to the service.
  metadata <- toJSON(metadata[!is.na(metadata)], auto_unbox = TRUE)

  # How a successful response is turned into a result: the user's callback if
  # one was given, otherwise JSON-parse the response body.
  parse_response <- if (is.null(callback)) {
    function(resp) fromJSON(rawToChar(resp$content))
  } else {
    callback
  }

  # Results are filled in asynchronously by the curl callbacks below, so they
  # live in this frame and are updated with `<<-`.
  resps <- vector("list", length(audios))
  store <- function(index, value) {
    # `resps[index] <- list(value)` keeps the slot even when `value` is NULL,
    # whereas `resps[[index]] <- NULL` would delete it and shift later results.
    resps[index] <<- list(value)
    invisible(NULL)
  }
  # Closure factories bind each callback to the position of its audio file.
  make_done <- function(index) {
    force(index)
    function(resp) store(index, parse_response(resp))
  }
  make_fail <- function(index) {
    force(index)
    function(resp) store(index, resp)
  }

  for (index in seq_along(audios)) {
    # `[[` extracts the path whether `audios` is a character vector or a list.
    form <- form_file(audios[[index]], content_type)
    new_handle(url = url) %>%
      handle_setopt("userpwd" = userpwd) %>%
      handle_setheaders(
        "X-Watson-Learning-Opt-Out" = keep_data,
        "Content-Type" = "multipart/form-data",
        "Transfer-Encoding" = "chunked"
      ) %>%
      handle_setform(metadata = metadata, upload = form) %>%
      multi_add(done = make_done(index), fail = make_fail(index))
  }

  # Perform all queued requests; the callbacks populate `resps`.
  multi_run()
  resps
}
# DecisionSystems/cognizer documentation built on May 30, 2019, 11:40 a.m.