rlang-loophole: Manage traces, timeseries, and classification for timing data extracted with Loophole

Documented in calculatePerformanceCross dtwDistParallel dtwDistParallel executeCrossValidation parseDatabaseFilename parseDatabaseFilename printConfigData printConfigData recognitionRate recognitionRate runCrossValidation tracesMatch tracesMatch

#' runCrossValidation
#'
#' Runs an N-fold cross validation with m test samples per round for all possible combinations of parameters
#' @param inPath Path containing timeseries databases.
#' @param windowTypes Window types.
#' @param stepPatterns Step patterns.
#' @param windowSizes Window sizes.
#' @param P Time averaging parameter.
#' @param maxLength Maximum trace length
#' @param sampFun Sampling function.
#' @param N Number of cross validation rounds - one per reference sample.
#' @param m Number of test samples per round.
#' @param logFile Log file, defaults to NA.
#' @return results Summary table with performance for each combination of
#' parameters.
#' @export
#' @examples
#' runCrossValidation('dbs/')

runCrossValidation <- function(inPath,
                               windowTypes = c('sakoechiba', 'slantedband', 'itakura'),
                               stepPatterns = c('symmetric1', 'symmetric2', 'asymmetric'),
                               windowSizes = c(1, 5, 10, 20, 50, 100),
                               P = c(1, 2, 5, 10, 20, 50),
                               maxLength = c(1000, 2000, 3000, 4000),
                               sampFun = c('sum', 'max'),
                               N, m,
                               logFile = NA){
  
  results <- data.frame(maxLength = NULL, P = NULL, sampFun = NULL, windowType = NULL, stepPattern = NULL,
                        windowSize = NULL, tolerance = NULL, rate = NULL)
  
  parameters <- as.data.frame(expand.grid(windowType = windowTypes,
                                             stepPattern = stepPatterns,
                                             windowSize = windowSizes,
                                             P = P, maxLength = maxLength, sampFun = sampFun))

 # parameters <- parameters[-which(parameters$windowType == 'itakura' & parameters$windowSize != 100),]

  for (p in 1:nrow(parameters)){
    conf <- parameters[p,]
    perf <- executeCrossValidation(inPath, conf, N, m, logFile)

    for (tol in names(perf)){
      results <- rbind(results, data.frame(maxLength = conf$maxLength, P = conf$P, sampFun = conf$sampFun,
                                           windowType = conf$windowType, stepPattern = conf$stepPattern,
                                           windowSize = conf$windowSize, tolerance = as.numeric(tol),
                                           rate = perf[tol]))
    }
  }
  return(results)
}


#' executeCrossValidation
#'
#' Executes N-fold cross validation with m tests per round for a fixed set of DTW parameters
#' @param conf DTW parameters including step pattern and window type & size.
#' @param N Number of rounds in the cross validation.
#' @param m Number of test samples per round.
#' @param logFile Name of log file, defaults to NA
#' @return perf Performance vector averaged over all m*N classifications.

executeCrossValidation <- function(inPath, conf, N, m, logFile=NA) {

    # Copy all output into a log file
    if (!is.na(logFile)) {
     #   sink(logFile, split = TRUE)
    }

    # Get dbs of timeseries
    dbs <- list.files(inPath, pattern=paste0("\\.db$"))
    
    # Creates directory for temp cross distance matrix
    suppressWarnings(dir.create(paste(inPath, "crossMatrices", sep="")))
    
    # For each different type of time series
    for (dbname in dbs) {
     
        # Extract db parameters
        tsParameters <- parseDatabaseFilename(dbname)

        # Load db
        print(paste("Loading ", inPath, dbname, sep=""), quote = F)
        db <- readRDS(paste(inPath, dbname, sep=""))

        # Print parameters
        print(paste("Conf:", printConfigData(conf)), quote = F)

        # Calculates rounds of DTW with given config
        perf <- calculatePerformanceCross(db, conf, N, m, paste(inPath, "crossMatrices/", sep=""))
        print(perf)
        # Print perforamnce
        print(paste("Performance: ", paste(perf, collapse=" ")), quote = F)
        print("=============", quote = F)
    }
    return(perf)

}

#' parseDatabaseFilename
#'
#' Extract parameters from db filename
#' @param name Filename.
#' @return Named list of parameters.
#' @export
#' @examples
#' parseDatabaseFilename("ts_1000_20_sum.db")
parseDatabaseFilename <- function(name) {

    x <- stringr::str_match_all(name, "ts_([0-9]+)_([0-9]+)_([a-zA-Z]+)\\.db")
    if (length(x) != 0) {
        return(list(maxLength=x[[1]][1,2], P=x[[1]][1,3], sampFun=x[[1]][1,4]))
    }

}

#' printConfigData
#'
#' Returns formatted config data string
#' @param conf Config data.frame.
#' @return String.
#' @export
printConfigData <- function(conf) {

    return(paste0("window - ", conf$windowType,
                 " | step - ", conf$stepPattern,
                 " | size - ", conf$windowSize,
                 " | P - ", conf$P,
                 " | length - ", conf$maxLength))

}

#' calculatePerformanceCross
#'
#' Calculates aprox. matching rate for given a database and configuration
#' @param db Database of timeseries.
#' @param conf Configuration for DTW algorithm as a data.frame row.
#' @param N Number of rounds, with one different reference set each round.
#' @param m Number of test sets per round.
#' @param keepMatrices Save intermediate cross DTW matrices in files. Defaults to NA. 
#' @return Average matching rate after N rounds.
#' @export
#' @examples
#' calculatePerformanceCross(db, conf, keepMatrices="dbs/crossMatrices/")

calculatePerformanceCross <- function(db, conf, N=10, m=3, keepMatrices=NA) {

    tmp <- names(db)[[1]]
    pattern <- stringr::str_replace(tmp, "_[0-9]+\\.json$", "")
    tmp <- names(db[grepl(pattern, names(db))])
    sets <- as.numeric(stringr::str_match(tmp, "_([0-9]+)\\.json")[,2])

    # Select N references
    set.seed(42) # use same samples for every calculations
    randSamples <- sample(sets, N, replace=FALSE)

    # Performance per round (2 times beacuse asymetry)
    performance <- matrix(0, ncol=5, nrow=N, dimnames=list(NULL, c("1","3","5","10","30")))

    # Do N rounds
    for (k in 1:N) {

        print(paste("Round: ", k, "/", N, sep=""))
        
        r <- randSamples[k] # reference sample
        ref <- db[grepl(paste("_", r, "\\.json$", sep=""), names(db))]
        
        set.seed(k) # change test sets each round
        tSamples <- sample(sets[sets!=sets[k]], m, replace = FALSE) # test samples

        for (t in tSamples){
          # Database timeseries names have the format 'webpage_k.json' (k in [0:14])
          test <- db[grepl(paste("_", t,"\\.json$", sep=""), names(db))]

          # Compute cross DTW matrix for each configuration and pair of test/ref
          crossDTWmatrix <- dtwDistParallel(test, ref, conf, paste(r,t,sep="-"), output=keepMatrices)

          # Calculate matching rate 
          performance[k,] <- performance[k,] + recognitionRate(crossDTWmatrix)
        }
        performance[k,] <- performance[k,]/m # performance averaged over the m tests
    }

    return(colMeans(performance)) # performance averaged over the N rounds/refs

}

#' tracesMatch
#'
#' Returns true if trace's names are from same page
#' @param a Query's trace name
#' @param b List of names ordered by proximity.
#' @param tolerance Tolerance. Defaults to 1.
#' @return TRUE if query's name is in "tolerance" firsts sublist.
#' @export
tracesMatch <- function(query, response, tolerance=1) {
    query <- stringr::str_replace(query, "_[0-9]+\\.json$", "")
    # optional, remove TLDs with 'tldextract'
    for (j in 1:tolerance) {
        response[j] <- stringr::str_replace(response[j], "_[0-9]+\\.json$", "")
        if (query == response[j]) return(TRUE)
    }
    return (FALSE)
}

#' recognitionRate
#'
#' Given a cross distance matrix returns recognition rate
#' @param M Cross distance matrix.
#' @param tolerance Tolerance. Defaults to 1.
#' @return Recognition rate.
#' @export
recognitionRate <- function(M) {

    count <- rep(0,5)
    i <- 1

    for (t in c("1","3","5","10","30")) {

        tmp <- sapply(rownames(M), function(q) tracesMatch(q, colnames(M)[order(M[q,])], as.numeric(t)))
        count[i] <- length(tmp[tmp == TRUE])

        tmp <- sapply(colnames(M), function(q) tracesMatch(q, rownames(M)[order(M[,q])], as.numeric(t)))
        count[i] <- count[i] + length(tmp[tmp == TRUE])

        count[i] <- count[i] / (dim(M)[1] * 2) # count twice because of asymetry

        i <- i + 1

    }

    return(count)

}

#' dtwDistParallel
#'
#' Calculates cross DTW distance matrix for X and Y (lists of timeseries)
#' @param mx List of timeseries.
#' @param my List of timeseries.
#' @param conf Parameters for 'dtw' function.
#' @param output Output folder to save the cross DTW distance matrix. Defaults to NA.
#' @return Returns the cross DTW distance matrix.
#' @export
#' @examples
#' dtwDistParallel()

dtwDistParallel <- function(mx, my, conf, tag, output=NA) {

    # Try to load previously calculated matrix
    if (!is.na(output)) {
        name <- paste("crossDTWmatrix", conf$maxLength, conf$P, conf$windowType,
                      conf$stepPattern, conf$windowSize, tag, sep="_")
        name <- paste(output, name, ".dat", sep="")

        if (file.exists(name)) {
            print("Loading precalculated matrix...")
            return(readRDS(file=name))
        }

    }

    # Prepare matrix
    print("Start matrix computation (may take a long time)")
    ptm <- proc.time()

    # Register parallel support
    doParallel::registerDoParallel(cores=parallel::detectCores())

    # Calculate DTW
    res <- foreach::foreach(e=1:(length(mx)*length(my)), .inorder=TRUE) %dopar% {

        i <- ((e-1) %% length(my)) + 1
        j <- floor(e / length(my)) + 1

        tryCatch({
            dtw::dtw(unlist(mx[[i]]), unlist(my[[j]]), dist.method="Euclidean",
                distance.only=TRUE, keep.internals=FALSE,
                window.type=as.character(conf$windowType),
                step=get(as.character(conf$stepPattern),envir=getNamespace("dtw")),
                window.size=conf$windowSize)$dist
        }, error = function(e) {
            return(Inf) # return Infinite if not alignment found
        })

    }

    # Move res vector into M
    M <- matrix(unlist(res), nrow=length(mx), ncol=length(my), dimnames=list(names(mx), names(my)))

    # Unregister parallel support
    doParallel::stopImplicitCluster()
    print(paste("Execution time:", (proc.time() - ptm)[3]))

    # Save matrix into file
    if (!is.na(output)) {
        print("Storing computed matrix")
        saveRDS(M, file=name)
    }

    return(M)

}

cgvwzq/rlang-loophole documentation built on May 20, 2019, 5:10 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cgvwzq/rlang-loophole
Manage traces, timeseries, and classification for timing data extracted with Loophole

R/runCrossValidation.R
In cgvwzq/rlang-loophole: Manage traces, timeseries, and classification for timing data extracted with Loophole

Defines functions runCrossValidation executeCrossValidation parseDatabaseFilename printConfigData calculatePerformanceCross tracesMatch recognitionRate dtwDistParallel

Documented in calculatePerformanceCross dtwDistParallel dtwDistParallel executeCrossValidation parseDatabaseFilename parseDatabaseFilename printConfigData printConfigData recognitionRate recognitionRate runCrossValidation tracesMatch tracesMatch

R Package Documentation

Browse R Packages

We want your feedback!

cgvwzq/rlang-loophole Manage traces, timeseries, and classification for timing data extracted with Loophole

R/runCrossValidation.R In cgvwzq/rlang-loophole: Manage traces, timeseries, and classification for timing data extracted with Loophole

Defines functions runCrossValidation executeCrossValidation parseDatabaseFilename printConfigData calculatePerformanceCross tracesMatch recognitionRate dtwDistParallel

Documented in calculatePerformanceCross dtwDistParallel dtwDistParallel executeCrossValidation parseDatabaseFilename parseDatabaseFilename printConfigData printConfigData recognitionRate recognitionRate runCrossValidation tracesMatch tracesMatch

R Package Documentation

Browse R Packages

We want your feedback!

cgvwzq/rlang-loophole
Manage traces, timeseries, and classification for timing data extracted with Loophole

R/runCrossValidation.R
In cgvwzq/rlang-loophole: Manage traces, timeseries, and classification for timing data extracted with Loophole