hour_secs <- 60 * 60
min_secs <- 60
#' Parse an srt file and return a list of subs
#'
#' Parse an srt file and return a list of subs
#'
#' @param f filename/path of the .srt (or connection: it's passed to
#' readLine)
#' @param comment lines starting with this char will be ignored
#' @param set_id_as_prog if TRUE, fix the id of the srt (simple progressive,
#' independent from what parsed)
#' @export
read_srt <- function(f = NULL, comment = '##', set_id_as_prog = TRUE){
lines <- readLines(f)
## remove comments
comment_lines <- grepl(sprintf("^%s", comment), lines)
lines <- lines[!comment_lines]
## splitta sulla base di righe vuote (almeno una necessaria
## tra uno spezzone e l'altro
srt <- split(lines, cumsum(lines == ''))
## now remove the first element if it's a "" and remove (all blank chunks)
srt <- lapply(srt, function(x) if (x[1] == '') x[-1] else x)
srt <- Filter(function(x) length(x) > 0L, srt)
parsed_srt <- lapply(srt, srt_parser)
ids <- unlist(lapply(parsed_srt, function(x) x$id))
names(parsed_srt) <- ids
## check che id e progressivo coincidano
## -------------------------------------
## id è quello che si trova dentro
## prog è il progressivo post parsing
## auspicabilmente dovrebbero coincidere a fine file se no qualcosa
## è andato storto (manca una linea da qualche parte)
progs <- seq(from = min(as.integer(ids), na.rm = TRUE),
length = length(parsed_srt))
if (!all(as.integer(ids) == progs)){
msg <- c("problemi nel parsing: id del srt origine e\n",
"prog del parsato non coincidono, probabilmente manca\n",
"una linea bianca necessaria tra due sub,\n",
"oppure l'srt è un collage di più file che partono da 1.\n",
"Il confronto tra i due: ")
check <- data.frame('id' = ids, 'prog' = progs)
check <- check[with(check, id != prog), ]
rownames(check) <- NULL
warning(msg)
print(check)
}
## check/validazione del contenuto del srt parsato
lapply(parsed_srt, srt_validator)
## per fixare i collage (o fare quello che normalmente fanno i subeditor)
if (set_id_as_prog) {
id_to_prog <- function(s, p) {s$id <- p; s}
parsed_srt <- Map(id_to_prog, parsed_srt, as.list(progs))
names(parsed_srt) <- progs
}
## return
parsed_srt
}
srt_time_ptrn <- "^(\\d\\d):(\\d\\d):(\\d\\d),(\\d\\d\\d).+(\\d\\d):(\\d\\d):(\\d\\d),(\\d\\d\\d)$"
srt_parser <- function(x){
## dati originali
id_pos <- 1L
times_pos <- 2L
text_pos <- seq_along(x) %without% 1:2
id <- x[id_pos]
times <- x[times_pos]
text <- x[text_pos]
## elaborazioni
times <- gsub(" ", "", times) # togliere eventuali spazi bianghi
## estrazione tempo inizio
start_hour <- as.integer(gsub(srt_time_ptrn, "\\1", times))
start_min <- as.integer(gsub(srt_time_ptrn, "\\2", times))
start_sec <- as.integer(gsub(srt_time_ptrn, "\\3", times))
start_msec <- as.integer(gsub(srt_time_ptrn, "\\4", times))
## estrazione tempo fine
stop_hour <- as.integer(gsub(srt_time_ptrn, "\\5", times))
stop_min <- as.integer(gsub(srt_time_ptrn, "\\6", times))
stop_sec <- as.integer(gsub(srt_time_ptrn, "\\7", times))
stop_msec <- as.integer(gsub(srt_time_ptrn, "\\8", times))
## metto a posto il pattern che si sa mai ...
p <- "%02.0f:%02.0f:%02.0f,%03.0f --> %02.0f:%02.0f:%02.0f,%03.0f"
times <- sprintf(p,
start_hour, start_min, start_sec, start_msec,
stop_hour, stop_min, stop_sec, stop_msec)
## calcolo del tempo in secondi complessivi
start_secs <-
start_hour * hour_secs +
start_min * min_secs +
start_sec +
start_msec/1000
stop_secs <-
stop_hour * hour_secs +
stop_min * min_secs +
stop_sec +
stop_msec/1000
## output
list(id = id,
times = times,
text = text,
## processed start time
start_hour = start_hour,
start_min = start_min,
start_sec = start_sec,
start_msec = start_msec,
## processed stop time
stop_hour = stop_hour,
stop_min = stop_min,
stop_sec = stop_sec,
stop_msec = stop_msec,
## processed msecs
start_secs = start_secs,
stop_secs = stop_secs
)
}
srt_validator <- function(s){
valid_id(s$id)
valid_times(s)
valid_text(s$text)
}
valid_id <- function(x) {
ok <- !is.na(as.integer(x))
if (!ok) stop("not valid id for: ", x)
}
valid_text <- function(x) {
ok <- is.character(x) && length(x) > 0L
if (!ok) stop("there are subs with no text: ", x)
}
valid_times <- function(x){
## questi sono già interi, testare solo che siano nel range giusto
valid_hour(x$start_hour)
valid_min(x$start_min)
valid_sec(x$start_sec)
valid_msec(x$start_msec)
valid_hour(x$stop_hour)
valid_min(x$stop_min)
valid_sec(x$stop_sec)
valid_msec(x$stop_msec)
}
valid_hour <- function(x) {
ok <- in_range(x, c(0,24)) #qualche dubbio sull'estremo superiore per srt
if (!ok) stop("not valid hour for: ", x)
}
valid_min <- function(x) {
ok <- in_range(x, c(0,59))
if (!ok) stop("not valid min for: ", x)
}
valid_sec <- function(x) {
ok <- in_range(x, c(0,59))
if (!ok) stop("not valid sec for: ", x)
}
valid_msec <- function(x) {
ok <- in_range(x, c(0,999))
if (!ok) stop("not valid sec for: ", x)
}
in_range <- function(t, r) {
t %in% seq(from = r[1], to = r[2])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.