This vignette examines two aspects of the fastcpd package:

1. Time complexity of `fastcpd.mean()`: We assess how the execution time scales with the length of the data.
2. Impact of SeDG in `fastcpd.lasso()`: We simulate a Lasso regression setting with multiple change points and compare the detected change points under two settings of the `vanilla_percentage` parameter. This highlights the performance improvement provided by SeDG.
fastcpd.mean()
In this section, we generate multivariate normal data with varying lengths and measure the execution time of the `fastcpd.mean()` function. We then create a log-log plot of the execution times and perform a linear regression on the log-transformed data to estimate the power-law exponent, i.e. the slope of the fitted line, which describes how the execution time grows with the data length.
# Load necessary libraries
library(ggplot2)
library(fastcpd)

# Set a seed for reproducibility
set.seed(1)

# Data lengths to benchmark, from 1e3 up to 5e7 observations
ns <- 1e+3 * c(1, 5, 10, 50, 100, 500, 1000, 5000, 1e+4, 5e+4)
p <- 4  # Dimensionality of the data

# Time one fastcpd.mean() run per data length. The data are simulated inside
# the timed expression, so data generation is part of each measurement.
execution_times <- vapply(
  ns,
  function(n_obs) {
    timing <- system.time(
      fastcpd.mean(
        mvtnorm::rmvnorm(n_obs, mean = rep(0, p), sigma = diag(1, p)),
        r.progress = FALSE,
        cp_only = TRUE
      )
    )
    # First element of the proc_time object: user CPU time in seconds
    timing[["user.self"]]
  },
  numeric(1)
)

# Collect data lengths and timings for plotting
timing_df <- data.frame(n = ns, time = execution_times)

# Plot execution times against data length, both axes on a log10 scale
ggplot(timing_df, aes(x = n, y = time)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Time Complexity of fastcpd.mean",
    x = "Data Length (log10 scale)",
    y = "Execution Time (seconds, log10 scale)"
  ) +
  theme_minimal()
# Move both variables to the log10 scale so that a power law
# time ~ n^k becomes a straight line with slope k
log_ns <- log10(ns)
log_times <- log10(execution_times)

# Fit the straight line on the log-log scale
regression_model <- lm(log_times ~ log_ns)
summary(regression_model)
#>
#> Call:
#> lm(formula = log_times ~ log_ns)
#>
#> Residuals:
#>      Min       1Q   Median       3Q      Max
#> -0.18758 -0.11814  0.01454  0.03862  0.38958
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -5.27452    0.21144  -24.95 7.13e-09 ***
#> log_ns       0.92934    0.03814   24.37 8.58e-09 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.1757 on 8 degrees of freedom
#> Multiple R-squared:  0.9867, Adjusted R-squared:  0.985
#> F-statistic: 593.8 on 1 and 8 DF,  p-value: 8.582e-09

# The slope on log_ns is the estimated power coefficient
power_coefficient <- coef(regression_model)["log_ns"]
power_coefficient
#>   log_ns
#> 0.929341
fastcpd.lasso()
In this section, we simulate a Lasso regression model with change points. We compare the vanilla approach (without SeDG, `vanilla_percentage = 1`) and the SeDG-enhanced approach (`vanilla_percentage = 0`) by varying the `vanilla_percentage` parameter. The run time and the detected change points are reported for both settings.
# Load required libraries
library(fastcpd)

# Set seed for reproducibility
set.seed(1)

# Simulation parameters
n <- 480     # Total number of observations
p_true <- 6  # Number of true predictors with non-zero coefficients
p <- 50      # Total number of predictors

# Design matrix: n observations of p independent standard normal predictors
x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))

# True non-zero coefficients, one row per segment (4 segments)
theta_0 <- rbind(
  runif(p_true, -5, -2),
  runif(p_true, -3, 3),
  runif(p_true, 2, 5),
  runif(p_true, -5, 5)
)

# Pad each row with zeros for the remaining p - p_true predictors
theta_0 <- cbind(theta_0, matrix(0, nrow = 4, ncol = p - p_true))

# Simulate the response; the coefficients switch after observations
# 80, 200 and 320, and every segment gets N(0, 2^2) noise
y <- c(
  x[1:80, ] %*% theta_0[1, ] + rnorm(80, 0, 2),
  x[81:200, ] %*% theta_0[2, ] + rnorm(120, 0, 2),
  x[201:320, ] %*% theta_0[3, ] + rnorm(120, 0, 2),
  x[321:n, ] %*% theta_0[4, ] + rnorm(160, 0, 2)
)

# Combine response and predictors into a data frame
lasso_data <- data.frame(y = y, x = x)

# Detect change points using fastcpd.lasso with SeDG (vanilla_percentage = 0)
system.time(
  result_seg_non_vanilla <- fastcpd.lasso(
    lasso_data,
    vanilla_percentage = 0,
    r.progress = FALSE
  )
)
#>    user  system elapsed
#>   5.556   0.794   6.364
cat("Change points with SeDG (vanilla_percentage = 0):\n")
#> Change points with SeDG (vanilla_percentage = 0):
result_seg_non_vanilla@cp_set
#> [1]  79 203 320

# Detect change points using fastcpd.lasso with the vanilla approach
# (vanilla_percentage = 1)
system.time(
  result_seg_vanilla <- fastcpd.lasso(
    lasso_data,
    vanilla_percentage = 1,
    r.progress = FALSE
  )
)
#>    user  system elapsed
#> 112.649   5.302 118.156
cat("Change points with vanilla approach (vanilla_percentage = 1):\n")
#> Change points with vanilla approach (vanilla_percentage = 1):
result_seg_vanilla@cp_set
#> [1] 200 321
This document is generated by the following code:
# Knit the source vignette (time-complexity.Rmd.original) into
# vignettes/time-complexity.Rmd, then delete the old vignettes/time-complexity
# directory and move the freshly generated time-complexity directory into
# vignettes/. NOTE(review): that directory presumably holds the figures, since
# the setup chunk sets fig.path = "time-complexity/" -- confirm.
R -e 'knitr::knit("vignettes/time-complexity.Rmd.original", output = "vignettes/time-complexity.Rmd")' && rm -rf vignettes/time-complexity && mv -f time-complexity vignettes
# Chunk options: merge code and output, prefix output with "#>", evaluate all
# chunks, hide warnings, and write figures under time-complexity/
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE,
  warning = FALSE,
  fig.path = "time-complexity/"
)
library(fastcpd)

# Load necessary libraries
library(ggplot2)
library(fastcpd)

# Set a seed for reproducibility
set.seed(1)

# Data lengths to benchmark, from 1e3 up to 5e7 observations
ns <- 1e+3 * c(1, 5, 10, 50, 100, 500, 1000, 5000, 1e+4, 5e+4)
p <- 4  # Dimensionality of the data

# Time one fastcpd.mean() run per data length. The data are simulated inside
# the timed expression, so data generation is part of each measurement.
execution_times <- vapply(
  ns,
  function(n_obs) {
    timing <- system.time(
      fastcpd.mean(
        mvtnorm::rmvnorm(n_obs, mean = rep(0, p), sigma = diag(1, p)),
        r.progress = FALSE,
        cp_only = TRUE
      )
    )
    # First element of the proc_time object: user CPU time in seconds
    timing[["user.self"]]
  },
  numeric(1)
)

# Collect data lengths and timings for plotting
timing_df <- data.frame(n = ns, time = execution_times)

# Plot execution times against data length, both axes on a log10 scale
ggplot(timing_df, aes(x = n, y = time)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Time Complexity of fastcpd.mean",
    x = "Data Length (log10 scale)",
    y = "Execution Time (seconds, log10 scale)"
  ) +
  theme_minimal()

# Move both variables to the log10 scale so that a power law
# time ~ n^k becomes a straight line with slope k
log_ns <- log10(ns)
log_times <- log10(execution_times)

# Fit the straight line on the log-log scale
regression_model <- lm(log_times ~ log_ns)
summary(regression_model)

# The slope on log_ns is the estimated power coefficient
power_coefficient <- coef(regression_model)["log_ns"]
power_coefficient

# Load required libraries
library(fastcpd)

# Set seed for reproducibility
set.seed(1)

# Simulation parameters
n <- 480     # Total number of observations
p_true <- 6  # Number of true predictors with non-zero coefficients
p <- 50      # Total number of predictors

# Design matrix: n observations of p independent standard normal predictors
x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))

# True non-zero coefficients, one row per segment (4 segments)
theta_0 <- rbind(
  runif(p_true, -5, -2),
  runif(p_true, -3, 3),
  runif(p_true, 2, 5),
  runif(p_true, -5, 5)
)

# Pad each row with zeros for the remaining p - p_true predictors
theta_0 <- cbind(theta_0, matrix(0, nrow = 4, ncol = p - p_true))

# Simulate the response; the coefficients switch after observations
# 80, 200 and 320, and every segment gets N(0, 2^2) noise
y <- c(
  x[1:80, ] %*% theta_0[1, ] + rnorm(80, 0, 2),
  x[81:200, ] %*% theta_0[2, ] + rnorm(120, 0, 2),
  x[201:320, ] %*% theta_0[3, ] + rnorm(120, 0, 2),
  x[321:n, ] %*% theta_0[4, ] + rnorm(160, 0, 2)
)

# Combine response and predictors into a data frame
lasso_data <- data.frame(y = y, x = x)

# Detect change points using fastcpd.lasso with SeDG (vanilla_percentage = 0)
system.time(
  result_seg_non_vanilla <- fastcpd.lasso(
    lasso_data,
    vanilla_percentage = 0,
    r.progress = FALSE
  )
)
cat("Change points with SeDG (vanilla_percentage = 0):\n")
result_seg_non_vanilla@cp_set

# Detect change points using fastcpd.lasso with the vanilla approach
# (vanilla_percentage = 1)
system.time(
  result_seg_vanilla <- fastcpd.lasso(
    lasso_data,
    vanilla_percentage = 1,
    r.progress = FALSE
  )
)
cat("Change points with vanilla approach (vanilla_percentage = 1):\n")
result_seg_vanilla@cp_set
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.