R/read_subtitles.R

Defines functions get_subtitles_info print.multisubtitles subtitles read_subtitles as_subtitle.default as_subtitle

Documented in as_subtitle as_subtitle.default get_subtitles_info print.multisubtitles read_subtitles subtitles

#' Convert an R object to a `subtitles` object
#'
#' @md
#' @param x an R object that can be coerced into a `subtitles` object
#' @param format a character string specifying the format of the subtitles.
#' Four formats can be read: \code{"subrip"}, \code{"substation"}, \code{"microdvd"}, \code{"subviewer"} (v.2) and \code{"webvtt"}.
#' Default is \code{"auto"} which tries to detect automatically the format of the file from its extension.
#' @param clean.tags logical. If \code{"TRUE"} (default), formatting tags are deleted from subtitles using \code{\link{clean_tags}}.
#' @param metadata a one-row dataframe or tibble, or any object that can be coerced
#' into a one-row tibble by \code{link[tibble]{as_tibble}}.
#' @param frame.rate a numeric value giving the frame rate in frames per second. Only relevant for MicroDVD format.
#' If \code{NA} (default), the function tries to extract the frame.rate from the file.
#' If it fails, the frame rate is set at 24p (23.976).
#' @param encoding the name of the encoding to be used. Default is "\code{auto}" and
#' uses \code{\link[readr]{guess_encoding}} to detect encoding.
#' @param ... passed on to downstream methods.
#'
#' @returns An object of class \code{subtitles} (see \code{\link{subtitles}}).
#'
#' @importFrom utils read.csv
#' @export
#' @examples
#' as_subtitle(
#'   c("WEBVTT",
#'     "X-TIMESTAMP-MAP=MPEGTS:181083,LOCAL:00:00:00.000",
#'     "",
#'     "3",
#'     "00:00:21.199 --> 00:00:22.333", ">> FEMALE SPEAKER:",
#'     "Don't stay up too late.",
#'     "",
#'     ""
#'   ), format = "webvtt"
#' )
as_subtitle <- function(
  x,
  format = "auto",
  clean.tags = TRUE,
  metadata = data.frame(),
  frame.rate = NA,
  encoding = "auto",
  ...
) {
  UseMethod("as_subtitle", x)
}

#' @rdname as_subtitle
#' @export
as_subtitle.default <- function(
  x,
  format = "auto",
  clean.tags = TRUE,
  metadata = data.frame(),
  frame.rate = NA,
  encoding = "auto",
  ...
) {
  # Trim empty lines at the beginning and end of the file
  subs <- x
  i <- 1
  while (subs[[i]] == "") {
    i <- i + 1
  }
  j <- length(subs)
  while (subs[[j]] == "") {
    j <- j - 1
  }
  subs <- subs[seq.int(i, j)]

  format <- match.arg(
    format,
    choices = c(
      "srt",
      "subrip",
      "sub",
      "subviewer",
      "microdvd",
      "ssa",
      "ass",
      "substation",
      "vtt",
      "webvtt",
      "auto"
    ),
    several.ok = FALSE
  )

  # .sub can be microdvd or subviewer
  #This is a very light test to solve the .sub extension
  if (format == "sub") {
    if (startsWith(subs[[1]], prefix = "{")) {
      format <- "microdvd"
    } else {
      format <- "subviewer"
    }
  }

  if (format %in% c("srt", "subrip")) {
    # Get rid off multiple empty new lines
    empty_li <- which(subs == "")
    to_del <- empty_li[c(empty_li, -1) == c(-1, empty_li + 1)]
    if (length(to_del) > 0) {
      subs <- subs[-to_del]
    }

    subs.newlines <- c(0, which(subs == ""))
    subs.n.li <- subs.newlines + 1
    subs.time.li <- subs.newlines + 2
    subs.txt.li <- mapply(
      seq,
      from = subs.time.li + 1,
      to = c(subs.newlines[-1], length(subs) + 1) - 1
    )

    subs.n <- subs[subs.n.li]
    subs.time <- subs[subs.time.li]
    subs.txt <- sapply(subs.txt.li, function(x) paste(subs[x], collapse = " "))

    subs.time <- strsplit(subs.time, split = " --> ")
    timecode.in <- sapply(subs.time, function(x) x[[1]])
    timecode.out <- sapply(subs.time, function(x) x[[2]])
  }

  if (format %in% c("ssa", "ass", "substation")) {
    subs.events.h <- subs[grep("^Format:.*Start,.*Text", subs)]

    subs.events.li <- grep("^Dialogue:", subs)
    subs.events <- subs[subs.events.li]

    comma.pos <- gregexpr(",", subs.events)
    comma.min <- min(lengths(comma.pos))

    subs.txt <- substr(
      subs.events,
      start = comma.pos[[1]][comma.min] + 1,
      stop = 10^5L
    )

    subs.events <- substr(
      subs.events,
      start = 1,
      stop = comma.pos[[1]][comma.min]
    )
    subs.events <- paste(subs.events, collapse = "\n")
    subs.events <- paste(subs.events.h, subs.events, sep = "\n")
    subs.events <- read.csv(
      textConnection(subs.events),
      header = TRUE,
      sep = ",",
      quote = "",
      stringsAsFactors = FALSE,
      fill = TRUE
    )

    subs.txt <- gsub("\\\\[Nn]", " ", subs.txt)
    timecode.in <- subs.events[, "Start"]
    timecode.out <- subs.events[, "End"]
    subs.n <- order(timecode.in)
  }

  if (format == "microdvd") {
    if (is.na(frame.rate)) {
      if (grepl("\\{[10]\\}\\{[10]\\}([0-9\\.]+)$", subs[1])) {
        frame.rate <- as.numeric(gsub("(\\{.+\\})+", "", subs[1]))
        subs <- subs[-1]
      } else {
        frame.rate <- 23.976
      }
    }

    subs <- gsub("\\|(\\{.+\\})+", "|", subs)

    timecode <- regmatches(subs, regexpr("^\\{[0-9]+\\}\\{[0-9]+\\}", subs))
    timecode <- strsplit(timecode, split = "\\}\\{")
    timecode <- lapply(timecode, gsub, pattern = "[\\{\\}]", replacement = "")
    timecode.in <- sapply(timecode, function(x) x[1])
    timecode.in <- as.numeric(timecode.in) / frame.rate
    timecode.in <- as.character(hms::hms(seconds = timecode.in))
    timecode.out <- sapply(timecode, function(x) x[2])
    timecode.out <- as.numeric(timecode.out) / frame.rate
    timecode.out <- as.character(hms::hms(seconds = timecode.out))

    subs.txt <- gsub("(\\{.+\\})+", "", subs)
    subs.txt <- gsub("\\|", " ", subs.txt)

    subs.n <- order(timecode.in)
  }

  if (format == "subviewer") {
    subs <- subs[!grepl("^\\[.+\\]", subs)]
    subs <- subs[seq(min(which(subs != "")), max(which(subs != "")))]
    subs.newlines <- c(0, which(subs == ""))
    subs.time.li <- subs.newlines + 1
    subs.txt.li <- subs.newlines + 2

    subs.txt <- subs[subs.txt.li]
    subs.txt <- gsub("\\[br\\]", " ", subs.txt)

    subs.time <- subs[subs.time.li]
    subs.time <- strsplit(subs.time, split = ",")
    timecode.in <- sapply(subs.time, function(x) x[[1]])
    timecode.out <- sapply(subs.time, function(x) x[[2]])

    subs.n <- order(timecode.in)
  }

  if (format %in% c("webvtt", "vtt")) {
    if (!grepl("^WEBVTT", subs[[1]])) {
      stop("Invalid WebVTT format")
    }

    subs <- subs[-1] # Remove header
    subs <- c(subs, "") # Add a virtual last line

    subs.newlines <- c(0, which(subs == ""))
    blocks.delim <- cbind(
      subs.newlines[-length(subs.newlines)] + 1,
      subs.newlines[-1] - 1
    )
    blocks <- apply(blocks.delim, 1, function(x) subs[x[[1]]:x[[2]]])

    blocks <- blocks[sapply(blocks, function(x) {
      any(sapply(x, function(y) .test_cuetiming(y)))
    })]

    blocks.tc.pos <- sapply(blocks, function(x) {
      which(sapply(x, function(y) .test_cuetiming(y)))
    })

    subs.n <- mapply(
      function(x, y, z) ifelse(y == 1, z, x[[1]]),
      x = blocks,
      y = blocks.tc.pos,
      z = seq(1, length(blocks))
    ) # If cues are not labelled, they are numbered

    #Extract and clean timing
    subs.time <- mapply(function(x, y) x[y], x = blocks, y = blocks.tc.pos)
    subs.time <- regmatches(
      subs.time,
      regexpr(
        "^(?:[0-9]{2, }:)?[0-9]{2}:[0-9]{2}.[0-9]{3}[[:blank:]]+-->[[:blank:]]+(?:[0-9]{2, }:)?[0-9]{2}:[0-9]{2}.[0-9]{3}",
        subs.time
      )
    )

    # Extract text content and deal with escape sequences
    subs.txt <- mapply(
      function(x, y) paste(x[(y + 1):length(x)], collapse = " "),
      x = blocks,
      y = blocks.tc.pos
    )
    subs.txt <- gsub("&amp;", "&", subs.txt)
    subs.txt <- gsub("&lt;", "<", subs.txt)
    subs.txt <- gsub("&gt;", ">", subs.txt)
    subs.txt <- gsub("&lrm;|&rlm;", "", subs.txt) # Left-to-right mark and right-to-left mark are not supported
    subs.txt <- gsub("&nbsp;", " ", subs.txt)

    subs.time <- strsplit(subs.time, split = "[[:blank:]]+-->[[:blank:]]+")
    timecode.in <- sapply(subs.time, function(x) x[[1]])
    timecode.out <- sapply(subs.time, function(x) x[[2]])
  }

  res <- subtitles(
    text = subs.txt,
    timecode.in = timecode.in,
    timecode.out = timecode.out,
    id = subs.n,
    metadata = metadata
  )

  if (clean.tags) {
    res <- clean_tags(res, format = format)
  }

  return(res)
}

#' @rdname as_subtitle
#' @export
as_subtitle.character <- as_subtitle.default

#' Read subtitles
#'
#' Reads subtitles from a file.
#'
#' @param file the name of the file which the subtitles are to be read from.
#' If it does not contain an absolute path, the file name is relative to the current working directory.
#' @inheritParams as_subtitle
#'
#' @details The support of WebVTT is basic and experimental.
#'
#' @returns
#' An object of class \code{subtitles} (see \code{\link{subtitles}}).
#'
#' @examples
#'
#' # read a SubRip file
#' f <- system.file("extdata", "ex_subrip.srt", package = "subtools")
#' f <- system.file("extdata", "ex_webvtt.vtt", package = "subtools")
#' read_subtitles(f)
#'
#' @export
read_subtitles <- function(
  file,
  format = "auto",
  clean.tags = TRUE,
  metadata = data.frame(),
  frame.rate = NA,
  encoding = "auto"
) {
  stopifnot("file not found" = file.exists(file))
  if (format == "auto") {
    format <- .extr_extension(file)
  }

  if (encoding == "auto") {
    encoding <- readr::guess_encoding(file)$encoding[1]
  }

  if (encoding == "ASCII") {
    subs <- readLines(file)
  } else {
    con <- file(file, encoding = encoding)
    subs <- readLines(con, warn = FALSE, encoding = encoding)
    close(con)
  }

  as_subtitle(
    x = subs,
    format = format,
    clean.tags = clean.tags,
    metadata = metadata,
    frame.rate = frame.rate,
    encoding = encoding
  )
}


#' Create a \code{subtitles} object
#'
#' A \code{subtitles} is a special form of \code{tibble}.
#'
#' @param text a character vector of subtitles text content.
#' @param timecode.in a character vector giving the time that the subtitles appear on the screen.
#' The format must be "HH:MM:SS.mS".
#' @param timecode.out a character vector giving the time that the subtitles disappear.
#' The format must be "HH:MM:SS.mS".
#' @param id a vector of IDs for subtitles.
#' If not provided it is generated automatically from \code{timecode.in} order.
#' @param metadata a one-row dataframe or tibble, or any object that can be coerced
#' into a one-row tibble by \code{link[tibble]{as_tibble}}.
#'
#' @returns a \code{subtitles} object i.e. a \code{tibble} with at least 4 columns containing IDs,
#' timecodes and text of the subtitles and optionally metadata in extra columns.
#'
#' @export
#'
subtitles <- function(
  text,
  timecode.in,
  timecode.out,
  id,
  metadata = data.frame()
) {
  if (missing(id)) {
    warning("ID missing: A numerical sequence was used instead.")
    id <- order(timecode.in)
  }
  subtitles <- tibble::tibble(
    ID = as.character(id),
    Timecode_in = as.character(timecode.in),
    Timecode_out = as.character(timecode.out),
    Text_content = as.character(text)
  )
  subtitles <- tibble::remove_rownames(subtitles)
  # subtitles$Timecode_in <- sapply(subtitles$Timecode_in, .format_subtime)
  # subtitles$Timecode_out <- sapply(subtitles$Timecode_out, .format_subtime)

  subtitles$Timecode_in <- .format_subtime(subtitles$Timecode_in)
  subtitles$Timecode_out <- .format_subtime(subtitles$Timecode_out)

  if (nrow(metadata) == 1) {
    metadata <- tibble::as_tibble(metadata)
    metadata <- metadata[rep(1, nrow(subtitles)), ]
    subtitles <- dplyr::bind_cols(subtitles, metadata)
  } else if (nrow(metadata) == nrow(subtitles)) {
    metadata <- tibble::as_tibble(metadata)
    subtitles <- dplyr::bind_cols(subtitles, metadata)
  }

  class(subtitles) <- c("subtitles", class(subtitles))
  .assert_subtitles(subtitles)
  return(subtitles)
}


#' Print method for multisubtitles
#'
#' @param x a \code{multisubtitles} object.
#' @param printlen the maximum number of subtitles to print.
#' @param ... further arguments passed to or from other methods.
#'
#' @returns Called for its side effects (printing to the console). Returns \code{NULL} invisibly.
#'
#' @examples
#' f <- system.file("extdata", "ex_subrip.srt", package = "subtools")
#' s <- read_subtitles(f)
#' bind_subtitles(s, s, collapse = FALSE)
#'
#' @export
print.multisubtitles <- function(x, printlen = 10L, ...) {
  cat("A multisubtitles object with", length(x), "elements\n")
  toprint <- ifelse(length(x) > printlen, printlen, length(x))
  for (i in 1:toprint) {
    cat("subtitles object [[", i, "]]\n", sep = "")
    print(x[[i]])
    cat("\n\n")
  }
  if (length(x) > printlen) {
    cat("...and ", printlen - length(x), "other subtitles elements")
  }
}


#' Get basic informations for subtitle objects
#'
#' @param x a \code{subtitles} or \code{multisubtitles} object.
#'
#' @returns Called for its side effects (printing to the console). Returns \code{NULL} invisibly.
#'
#' @examples
#' s <- read_subtitles(
#'   system.file("extdata", "ex_subrip.srt", package = "subtools")
#' )
#' get_subtitles_info(s)
#'
#' @export
get_subtitles_info <- function(x) {
  if (is(x, "subtitles")) {
    cat("subtitles object")
    cat("\n  Text lines:", nrow(x))
    cat(
      "\n  Duration:",
      as.character(hms::as_hms(x$Timecode_out[[nrow(x)]] - x$Timecode_in[[1]]))
    )

    metadata <- setdiff(
      names(x),
      c("ID", "Timecode_in", "Timecode_out", "Text_content")
    )
    cat("\n  Metadata:", length(metadata))
    if (length(metadata) > 0) cat("tags:", paste0(metadata, collapse = ", "))
  }

  if (is(x, "multisubtitles")) {
    cat("multisubtitles object with", length(x), "subtitles.")
  }
}

Try the subtools package in your browser

Any scripts or data that you put into this service are public.

subtools documentation built on March 24, 2026, 5:07 p.m.