R/clean_subtitles.R

Defines functions clean_patterns clean_captions clean_tags

Documented in clean_captions clean_patterns clean_tags

#' Clean subtitles
#'
#' Functions to clean subtitles. \code{clean_tags} cleans formatting tags.
#' \code{clean_captions} cleans closed captions, i.e. all text enclosed in
#' parentheses or square brackets.
#' \code{clean_patterns} provides a more general and flexible cleaning based
#' on regular expressions.
#'
#' @param x a \code{subtitles} or \code{multisubtitles} object.
#' @param format the original format of the \code{subtitles} objects. One of
#'   "srt", "subrip", "sub", "subviewer", "microdvd", "ssa", "ass",
#'   "substation", "vtt", "webvtt" or "all". Angle-bracket tags are removed
#'   for "srt", "subrip", "vtt" and "webvtt"; brace-delimited tags starting
#'   with a backslash are removed for "ass", "ssa" and "substation"; "all"
#'   removes both kinds. The other accepted values remove no tag.
#' @param pattern a character string containing a regular expression to be
#'   matched and cleaned.
#' @param clean.empty logical. Should lines left empty ("") after cleaning be
#'   deleted?
#'
#' @returns A \code{subtitles} or \code{multisubtitles} object.
#' @export
#' @rdname clean
clean_tags <- function(x, format = "srt", clean.empty = TRUE) {
  if (!inherits(x, c("subtitles", "multisubtitles"))) {
    stop("x must be a 'subtitles' or a 'multisubtitles' object.")
  }

  # A multisubtitles object is processed element by element; lapply() returns
  # a plain list, so the class is restored afterwards.
  if (inherits(x, "multisubtitles")) {
    x <- lapply(x, clean_tags, format = format, clean.empty = clean.empty)
    class(x) <- "multisubtitles"
    return(x)
  }

  .assert_subtitles(x)
  format <- match.arg(
    format,
    choices = c(
      "srt",
      "subrip",
      "sub",
      "subviewer",
      "microdvd",
      "ssa",
      "ass",
      "substation",
      "vtt",
      "webvtt",
      "all"
    ),
    several.ok = FALSE
  )

  # Angle-bracket tags such as <i>...</i> (lazy match: one tag at a time).
  if (format %in% c("srt", "subrip", "vtt", "webvtt", "all")) {
    x$Text_content <- gsub("<.+?>", "", x$Text_content)
  }

  # Brace-delimited tags that start with a backslash, e.g. {\i1}.
  if (format %in% c("ass", "ssa", "substation", "all")) {
    x$Text_content <- gsub("\\{\\\\.+?\\}", "", x$Text_content)
  }

  if (clean.empty) {
    # `%in%` never returns NA, so a line whose text is NA is kept as is
    # instead of being turned into an all-NA row by an NA row index.
    is_empty <- x$Text_content %in% ""
    x <- x[!is_empty, ]
  }

  x
}


#' @rdname clean
#' @export
clean_captions <- function(x, clean.empty = TRUE) {
  if (!inherits(x, c("subtitles", "multisubtitles"))) {
    stop("x must be a 'subtitles' or a 'multisubtitles' object.")
  }

  # A multisubtitles object is processed element by element; lapply() returns
  # a plain list, so the class is restored afterwards.
  if (inherits(x, "multisubtitles")) {
    x <- lapply(x, clean_captions, clean.empty = clean.empty)
    class(x) <- "multisubtitles"
    return(x)
  }

  .assert_subtitles(x)

  # Remove text enclosed in parentheses, then text enclosed in square
  # brackets. The lazy quantifier stops at the first closing delimiter, so
  # several captions on one line are removed separately.
  x$Text_content <- gsub("\\(.+?\\)", "", x$Text_content)
  x$Text_content <- gsub("\\[.+?\\]", "", x$Text_content)

  if (clean.empty) {
    # `%in%` never returns NA, so a line whose text is NA is kept as is
    # instead of being turned into an all-NA row by an NA row index.
    is_empty <- x$Text_content %in% ""
    x <- x[!is_empty, ]
  }

  x
}


#' @rdname clean
#' @export
clean_patterns <- function(x, pattern, clean.empty = TRUE) {
  if (!inherits(x, c("subtitles", "multisubtitles"))) {
    stop("x must be a 'subtitles' or a 'multisubtitles' object.")
  }

  # A multisubtitles object is processed element by element; lapply() returns
  # a plain list, so the class is restored afterwards.
  if (inherits(x, "multisubtitles")) {
    x <- lapply(x, clean_patterns, pattern = pattern, clean.empty = clean.empty)
    class(x) <- "multisubtitles"
    return(x)
  }

  .assert_subtitles(x)

  # Delete every match of the user-supplied regular expression.
  x$Text_content <- gsub(pattern, "", x$Text_content)

  if (clean.empty) {
    # `%in%` never returns NA, so a line whose text is NA is kept as is
    # instead of being turned into an all-NA row by an NA row index.
    is_empty <- x$Text_content %in% ""
    x <- x[!is_empty, ]
  }

  x
}

Try the subtools package in your browser

Any scripts or data that you put into this service are public.

subtools documentation built on March 24, 2026, 5:07 p.m.