lbav: AV YouTube subs utilities

Documented in read_srt

hour_secs <- 60 * 60
min_secs  <- 60    

#' Parse an srt file and return a list of subs
#'
#' Parse an srt file and return a list of subs
#' 
#' @param f filename/path of the .srt (or connection: it's passed to
#'     readLine)
#' @param comment lines starting with this char will be ignored
#' @param set_id_as_prog if TRUE, fix the id of the srt (simple progressive,
#'     independent from what parsed)
#' @export
read_srt <- function(f = NULL, comment = '##', set_id_as_prog = TRUE){
    
    lines <- readLines(f)
    
    ## remove comments
    comment_lines <- grepl(sprintf("^%s", comment), lines)
    lines <- lines[!comment_lines]

    ## splitta sulla base di righe vuote (almeno una necessaria
    ## tra uno spezzone e l'altro
    srt <- split(lines, cumsum(lines == ''))

    ## now remove the first element if it's a "" and remove (all blank chunks)
    srt <- lapply(srt, function(x) if (x[1] == '') x[-1] else x)
    srt <- Filter(function(x) length(x) > 0L, srt)

    parsed_srt <- lapply(srt, srt_parser)
    ids <- unlist(lapply(parsed_srt, function(x) x$id))
    names(parsed_srt) <- ids
    ## check che id e progressivo coincidano
    ## -------------------------------------
    ## id è quello che si trova dentro
    ## prog è il progressivo post parsing
    ## auspicabilmente dovrebbero coincidere a fine file se no qualcosa
    ## è andato storto (manca una linea da qualche parte)
    progs <- seq(from = min(as.integer(ids), na.rm = TRUE),
                 length = length(parsed_srt))
    if (!all(as.integer(ids) == progs)){
        msg <- c("problemi nel parsing: id del srt origine e\n",
                 "prog del parsato non coincidono, probabilmente manca\n",
                 "una linea bianca necessaria tra due sub,\n",
                 "oppure l'srt è un collage di più file che partono da 1.\n",
                 "Il confronto tra i due: ")
        check <- data.frame('id' = ids, 'prog'  = progs)
        check <- check[with(check, id != prog), ]
        rownames(check) <- NULL
        warning(msg)
        print(check)
    }
    ## check/validazione del contenuto del srt parsato
    lapply(parsed_srt, srt_validator)
    ## per fixare i collage (o fare quello che normalmente fanno i subeditor)
    if (set_id_as_prog) {
        id_to_prog <- function(s, p) {s$id <- p; s}
        parsed_srt <- Map(id_to_prog, parsed_srt, as.list(progs))
        names(parsed_srt) <- progs
    }
    ## return
    parsed_srt
}

srt_time_ptrn <- "^(\\d\\d):(\\d\\d):(\\d\\d),(\\d\\d\\d).+(\\d\\d):(\\d\\d):(\\d\\d),(\\d\\d\\d)$"


srt_parser <- function(x){

    ## dati originali
    id_pos    <- 1L
    times_pos <- 2L
    text_pos  <- seq_along(x) %without% 1:2
    id    <- x[id_pos]
    times <- x[times_pos]
    text  <- x[text_pos]

    ## elaborazioni
    times <- gsub(" ", "", times) # togliere eventuali spazi bianghi

    ## estrazione tempo inizio 
    start_hour      <- as.integer(gsub(srt_time_ptrn, "\\1", times))
    start_min       <- as.integer(gsub(srt_time_ptrn, "\\2", times))
    start_sec       <- as.integer(gsub(srt_time_ptrn, "\\3", times))
    start_msec      <- as.integer(gsub(srt_time_ptrn, "\\4", times))
    ## estrazione tempo fine 
    stop_hour      <- as.integer(gsub(srt_time_ptrn, "\\5", times))
    stop_min       <- as.integer(gsub(srt_time_ptrn, "\\6", times))
    stop_sec       <- as.integer(gsub(srt_time_ptrn, "\\7", times))
    stop_msec      <- as.integer(gsub(srt_time_ptrn, "\\8", times))
    ## metto a posto il pattern che si sa mai ...
    p <- "%02.0f:%02.0f:%02.0f,%03.0f --> %02.0f:%02.0f:%02.0f,%03.0f"
    times <- sprintf(p,
                     start_hour, start_min, start_sec, start_msec,
                     stop_hour, stop_min, stop_sec, stop_msec) 
    ## calcolo del tempo in secondi complessivi
    start_secs <-
        start_hour * hour_secs +
        start_min  * min_secs +
        start_sec             +
        start_msec/1000
    stop_secs <-
        stop_hour * hour_secs +
        stop_min  * min_secs +
        stop_sec             +
        stop_msec/1000
    
    ## output
    list(id    = id,
         times = times,
         text  = text,
         ## processed start time
         start_hour = start_hour,
         start_min  = start_min, 
         start_sec  = start_sec, 
         start_msec = start_msec,
         ## processed stop time
         stop_hour = stop_hour,
         stop_min  = stop_min, 
         stop_sec  = stop_sec, 
         stop_msec = stop_msec,
         ## processed msecs
         start_secs = start_secs,
         stop_secs = stop_secs
         )
}


srt_validator <- function(s){
    valid_id(s$id)
    valid_times(s)
    valid_text(s$text)
}

valid_id <- function(x) {
    ok <- !is.na(as.integer(x))
    if (!ok) stop("not valid id for: ", x)
}

valid_text <- function(x) {
    ok <- is.character(x) && length(x) > 0L
    if (!ok) stop("there are subs with no text: ", x)
}

valid_times <- function(x){
    ## questi sono già interi, testare solo che siano nel range giusto
    valid_hour(x$start_hour)
    valid_min(x$start_min)
    valid_sec(x$start_sec)
    valid_msec(x$start_msec)
    valid_hour(x$stop_hour)
    valid_min(x$stop_min)
    valid_sec(x$stop_sec)
    valid_msec(x$stop_msec)
}

valid_hour <- function(x) {
    ok <- in_range(x, c(0,24)) #qualche dubbio sull'estremo superiore per srt
    if (!ok) stop("not valid hour for: ", x)
}

valid_min <- function(x) {
    ok <- in_range(x, c(0,59))
    if (!ok) stop("not valid min for: ", x)
}

valid_sec <- function(x) {
    ok <- in_range(x, c(0,59))
    if (!ok) stop("not valid sec for: ", x)
}

valid_msec <- function(x) {
    ok <- in_range(x, c(0,999))
    if (!ok) stop("not valid sec for: ", x)
}

in_range <- function(t, r) {
    t %in% seq(from = r[1], to = r[2])
}