Time Complexity Analysis in fastcpd
In fastcpd: Fast Change Point Detection via Sequential Gradient Descent

Introduction

This vignette examines two aspects of the fastcpd package:

Time Complexity of fastcpd.mean(): We assess how the execution time scales with the length of the data.
Impact of SeDG in fastcpd.lasso(): We simulate a Lasso regression setting with multiple change points and compare the detected change points under two settings of the vanilla_percentage parameter. This highlights the performance improvement provided by SeDG.

Time Complexity Simulation for `fastcpd.mean()`

In this section, we generate multivariate normal data with varying lengths and measure the execution time of the fastcpd.mean() function. We then create a log-log plot of the execution times and perform a linear regression on the log-transformed data to estimate the power law coefficient.

# Load necessary libraries
library(ggplot2)
library(fastcpd)

# Set a seed for reproducibility
set.seed(1)

# Define a sequence of data lengths
ns <- 1e+3 * c(1, 5, 10, 50, 100, 500, 1000, 5000, 1e+4, 5e+4)
p <- 4  # Dimensionality of the data

# Evaluate execution times for each data length
execution_times <- numeric(length(ns))

for (i in seq_along(ns)) {
  execution_times[i] <- system.time(fastcpd.mean(
    mvtnorm::rmvnorm(ns[i], mean = rep(0, p), sigma = diag(1, p)),
    r.progress = FALSE,
    cp_only = TRUE
  ))[[1]]
}

# Prepare data for plotting
time_data <- data.frame(
  n = ns,
  time = execution_times
)

# Plot execution times on a log-log scale
ggplot(time_data, aes(x = n, y = time)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Time Complexity of fastcpd.mean",
    x = "Data Length (log10 scale)",
    y = "Execution Time (seconds, log10 scale)"
  ) +
  theme_minimal()

plot of chunk time-complexity

# Log-transform the data for linear regression
log_ns <- log10(ns)
log_times <- log10(execution_times)

# Perform linear regression to estimate the power coefficient
regression_model <- lm(log_times ~ log_ns)
summary(regression_model)
#> 
#> Call:
#> lm(formula = log_times ~ log_ns)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.18758 -0.11814  0.01454  0.03862  0.38958 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -5.27452    0.21144  -24.95 7.13e-09 ***
#> log_ns       0.92934    0.03814   24.37 8.58e-09 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.1757 on 8 degrees of freedom
#> Multiple R-squared:  0.9867, Adjusted R-squared:  0.985 
#> F-statistic: 593.8 on 1 and 8 DF,  p-value: 8.582e-09

# Extract and display the slope (power coefficient)
power_coefficient <- coef(regression_model)[2]
power_coefficient
#>   log_ns 
#> 0.929341

Impact of SeDG in `fastcpd.lasso()`

In this section, we simulate a Lasso regression model with change points. We compare the performance of the vanilla approach (without SeDG) and the SeDG-enhanced approach by varying the vanilla_percentage parameter. The detected change points are extracted for both settings.

# Load required libraries
library(fastcpd)

# Set seed for reproducibility
set.seed(1)

# Simulation parameters
n <- 480         # Total number of observations
p_true <- 6      # Number of true predictors with non-zero coefficients
p <- 50          # Total number of predictors

# Generate design matrix with n observations and p predictors
x <- mvtnorm::rmvnorm(n, rep(0, p), diag(p))

# Create true coefficient matrix for 4 segments
theta_0 <- rbind(
  runif(p_true, -5, -2),
  runif(p_true, -3, 3),
  runif(p_true, 2, 5),
  runif(p_true, -5, 5)
)
# Pad the coefficient matrix with zeros for the remaining predictors
theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4))

# Simulate response variable with change points across segments
y <- c(
  x[1:80, ] %*% theta_0[1, ] + rnorm(80, 0, 2),
  x[81:200, ] %*% theta_0[2, ] + rnorm(120, 0, 2),
  x[201:320, ] %*% theta_0[3, ] + rnorm(120, 0, 2),
  x[321:n, ] %*% theta_0[4, ] + rnorm(160, 0, 2)
)

# Combine response and predictors into a data frame
lasso_data <- data.frame(y = y, x = x)

# Detect change points using fastcpd.lasso without SeDG (vanilla_percentage = 0)
system.time(result_seg_non_vanilla <- fastcpd.lasso(lasso_data, vanilla_percentage = 0, r.progress = FALSE))
#>    user  system elapsed 
#>   5.556   0.794   6.364
cat("Change points with SeDG (vanilla_percentage = 0):\n")
#> Change points with SeDG (vanilla_percentage = 0):
result_seg_non_vanilla@cp_set
#> [1]  79 203 320

# Detect change points using fastcpd.lasso with the vanilla approach (vanilla_percentage = 1)
system.time(result_seg_vanilla <- fastcpd.lasso(lasso_data, vanilla_percentage = 1, r.progress = FALSE))
#>    user  system elapsed 
#> 112.649   5.302 118.156
cat("Change points with vanilla approach (vanilla_percentage = 1):\n")
#> Change points with vanilla approach (vanilla_percentage = 1):
result_seg_vanilla@cp_set
#> [1] 200 321

Notes

This document is generated by the following code:

R -e 'knitr::knit("vignettes/time-complexity.Rmd.original", output = "vignettes/time-complexity.Rmd")' && rm -rf vignettes/time-complexity && mv -f time-complexity vignettes

Appendix: all code snippets

knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>", eval = TRUE, warning = FALSE,
  fig.path="time-complexity/"
)
library(fastcpd)
# Load necessary libraries
library(ggplot2)
library(fastcpd)

# Set a seed for reproducibility
set.seed(1)

# Define a sequence of data lengths
ns <- 1e+3 * c(1, 5, 10, 50, 100, 500, 1000, 5000, 1e+4, 5e+4)
p <- 4  # Dimensionality of the data

# Evaluate execution times for each data length
execution_times <- numeric(length(ns))

for (i in seq_along(ns)) {
  execution_times[i] <- system.time(fastcpd.mean(
    mvtnorm::rmvnorm(ns[i], mean = rep(0, p), sigma = diag(1, p)),
    r.progress = FALSE,
    cp_only = TRUE
  ))[[1]]
}

# Prepare data for plotting
time_data <- data.frame(
  n = ns,
  time = execution_times
)

# Plot execution times on a log-log scale
ggplot(time_data, aes(x = n, y = time)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Time Complexity of fastcpd.mean",
    x = "Data Length (log10 scale)",
    y = "Execution Time (seconds, log10 scale)"
  ) +
  theme_minimal()

# Log-transform the data for linear regression
log_ns <- log10(ns)
log_times <- log10(execution_times)

# Perform linear regression to estimate the power coefficient
regression_model <- lm(log_times ~ log_ns)
summary(regression_model)

# Extract and display the slope (power coefficient)
power_coefficient <- coef(regression_model)[2]
power_coefficient
# Load required libraries
library(fastcpd)

# Set seed for reproducibility
set.seed(1)

# Simulation parameters
n <- 480         # Total number of observations
p_true <- 6      # Number of true predictors with non-zero coefficients
p <- 50          # Total number of predictors

# Generate design matrix with n observations and p predictors
x <- mvtnorm::rmvnorm(n, rep(0, p), diag(p))

# Create true coefficient matrix for 4 segments
theta_0 <- rbind(
  runif(p_true, -5, -2),
  runif(p_true, -3, 3),
  runif(p_true, 2, 5),
  runif(p_true, -5, 5)
)
# Pad the coefficient matrix with zeros for the remaining predictors
theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4))

# Simulate response variable with change points across segments
y <- c(
  x[1:80, ] %*% theta_0[1, ] + rnorm(80, 0, 2),
  x[81:200, ] %*% theta_0[2, ] + rnorm(120, 0, 2),
  x[201:320, ] %*% theta_0[3, ] + rnorm(120, 0, 2),
  x[321:n, ] %*% theta_0[4, ] + rnorm(160, 0, 2)
)

# Combine response and predictors into a data frame
lasso_data <- data.frame(y = y, x = x)

# Detect change points using fastcpd.lasso without SeDG (vanilla_percentage = 0)
system.time(result_seg_non_vanilla <- fastcpd.lasso(lasso_data, vanilla_percentage = 0, r.progress = FALSE))
cat("Change points with SeDG (vanilla_percentage = 0):\n")
result_seg_non_vanilla@cp_set

# Detect change points using fastcpd.lasso with the vanilla approach (vanilla_percentage = 1)
system.time(result_seg_vanilla <- fastcpd.lasso(lasso_data, vanilla_percentage = 1, r.progress = FALSE))
cat("Change points with vanilla approach (vanilla_percentage = 1):\n")
result_seg_vanilla@cp_set