```r
#' Cost function for logistic regression, i.e. the binomial family in GLM.
#'
#' @param data Data used to calculate the cost values. The last column is
#'   the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_binomial <- function(data, family = "binomial") {
  data <- as.matrix(data)
  p <- dim(data)[2] - 1
  out <- fastglm::fastglm(
    as.matrix(data[, 1:p]), data[, p + 1],
    family = family
  )
  return(out$deviance / 2)
}

#' Implementation of vanilla PELT for logistic regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list consisting of two elements: the change point locations and
#'   the negative log-likelihood summed over the segments.
pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1
  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)
  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:m) {
      k <- set[i] + 1
      cval[i] <- 0
      if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ]))
    }
    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    Fobj <- c(Fobj, min_val)
  }
  cp <- cp_set[[n + 1]]
  nLL <- 0
  cp_loc <- unique(c(0, cp, n))
  for (i in 1:(length(cp_loc) - 1)) {
    seg <- (cp_loc[i] + 1):cp_loc[i + 1]
    data_seg <- data[seg, ]
    out <- fastglm::fastglm(
      as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
      family = "binomial"
    )
    nLL <- out$deviance / 2 + nLL
  }
  output <- list(cp, nLL)
  names(output) <- c("cp", "nLL")
  return(output)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'   averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when inverting the
#'   Hessian matrix.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the new coefficients, the summation of
#'   coefficients so far and the updated Hessian matrix.
cost_logistic_update <- function(
    data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) {
  p <- length(data_new) - 1
  X_new <- data_new[1:p]
  Y_new <- data_new[p + 1]
  eta <- X_new %*% coef
  mu <- 1 / (1 + exp(-eta))
  cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu)
  lik_dev <- as.numeric(-(Y_new - mu)) * X_new
  coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
  cum_coef <- cum_coef + coef
  return(list(coef, cum_coef, cmatrix))
}

#' Calculate the negative log-likelihood given a data segment and a guess of
#' the coefficient.
#'
#' @param data Data to be used to calculate the negative log-likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log-likelihood.
neg_log_lik_binomial <- function(data, b) {
  p <- dim(data)[2] - 1
  X <- data[, 1:p, drop = FALSE]
  Y <- data[, p + 1, drop = FALSE]
  u <- as.numeric(X %*% b)
  L <- -Y * u + log(1 + exp(u))
  return(sum(L))
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Proportion of the data within which change points are ignored
#'   at the beginning, at the end and between change points.
#' @keywords internal
#'
#' @noRd
#' @return A list containing potential change point locations and the negative
#'   log-likelihood for each segment based on the change point guesses.
segd_binomial <- function(data, beta, B = 10, trim = 0.025) {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1
  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)

  # Choose the initial values based on pre-segmentation.
  index <- rep(1:B, rep(n / B, B))
  coef.int <- matrix(NA, B, p)
  for (i in 1:B) {
    out <- fastglm::fastglm(
      as.matrix(data[index == i, 1:p]),
      data[index == i, p + 1],
      family = "binomial"
    )
    coef.int[i, ] <- coef(out)
  }
  X1 <- data[1, 1:p]
  cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
  e_eta <- exp(as.numeric(X1 %*% coef))
  const <- e_eta / (1 + e_eta)^2
  cmatrix <- array((X1 %o% X1) * const, c(p, p, 1))

  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:(m - 1)) {
      coef_c <- coef[, i]
      cum_coef_c <- cum_coef[, i]
      cmatrix_c <- cmatrix[, , i]
      out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c)
      coef[, i] <- out[[1]]
      cum_coef[, i] <- out[[2]]
      cmatrix[, , i] <- out[[3]]
      k <- set[i] + 1
      cval[i] <- 0
      if (t - k >= p - 1) {
        cval[i] <- neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1))
      }
    }

    # The choice of initial values requires further investigation.
    cval[m] <- 0
    Xt <- data[t, 1:p]
    cum_coef_add <- coef_add <- coef.int[index[t], ]
    e_eta_t <- exp(as.numeric(Xt %*% coef_add))
    const <- e_eta_t / (1 + e_eta_t)^2
    cmatrix_add <- (Xt %o% Xt) * const
    coef <- cbind(coef, coef_add)
    cum_coef <- cbind(cum_coef, cum_coef_add)
    cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)

    # Adding a momentum term (TBD).
    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    coef <- coef[, ind2, drop = FALSE]
    cum_coef <- cum_coef[, ind2, drop = FALSE]
    cmatrix <- cmatrix[, , ind2, drop = FALSE]
    Fobj <- c(Fobj, min_val)
  }

  # Remove change points close to the boundaries.
  cp <- cp_set[[n + 1]]
  if (length(cp) > 0) {
    ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
    if (length(ind3) > 0) cp <- cp[-ind3]
  }
  nLL <- 0
  cp_loc <- unique(c(0, cp, n))
  for (i in 1:(length(cp_loc) - 1)) {
    seg <- (cp_loc[i] + 1):cp_loc[i + 1]
    data_seg <- data[seg, ]
    out <- fastglm::fastglm(
      as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
      family = "binomial"
    )
    nLL <- out$deviance / 2 + nLL
  }
  output <- list(cp, nLL)
  names(output) <- c("cp", "nLL")
  return(output)
}
```
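For a 0/1 response the saturated binomial model has zero log-likelihood, so the PELT cost above (half the deviance) equals the negative log-likelihood evaluated at the fitted coefficients; SeGD approximates this minimum by plugging the running average of its sequential estimates into `neg_log_lik_binomial()`. A minimal sanity check of that identity (an illustrative sketch, not part of the original code):

```r
# Illustrative check: cost_glm_binomial(), i.e. deviance / 2, should agree
# with neg_log_lik_binomial() evaluated at the fitted GLM coefficients,
# up to the optimizer's convergence tolerance.
set.seed(42)
p <- 3
x <- matrix(rnorm(100 * p), ncol = p)
y <- rbinom(100, 1, 1 / (1 + exp(-x %*% rnorm(p))))
fit <- fastglm::fastglm(x, y, family = "binomial")
c(
  cost_glm_binomial(cbind(x, y)),
  neg_log_lik_binomial(cbind(x, y), coef(fit))
)
```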
```r
#' Cost function for Poisson regression.
#'
#' @param data Data used to calculate the cost values. The last column is
#'   the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_poisson <- function(data, family = "poisson") {
  data <- as.matrix(data)
  p <- dim(data)[2] - 1
  out <- fastglm::fastglm(as.matrix(data[, 1:p]), data[, p + 1], family = family)
  return(out$deviance / 2)
}

#' Implementation of vanilla PELT for Poisson regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the change point locations.
pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1
  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)
  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:m) {
      k <- set[i] + 1
      if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) else cval[i] <- 0
    }
    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    Fobj <- c(Fobj, min_val)
    # if (t %% 100 == 0) print(t)
  }
  cp <- cp_set[[n + 1]]
  output <- list(cp)
  names(output) <- c("cp")
  return(output)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'   averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when inverting the
#'   Hessian matrix.
#' @param G Upper bound on the fitted Poisson mean used in the Hessian update.
#' @param L Winsorization lower bound.
#' @param H Winsorization upper bound.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the new coefficients, the summation of
#'   coefficients so far and the updated Hessian matrix.
cost_poisson_update <- function(
    data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
  p <- length(data_new) - 1
  X_new <- data_new[1:p]
  Y_new <- data_new[p + 1]
  eta <- X_new %*% coef
  mu <- exp(eta)
  cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G)
  lik_dev <- as.numeric(-(Y_new - mu)) * X_new
  coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
  coef <- pmin(pmax(coef, L), H)
  cum_coef <- cum_coef + coef
  return(list(coef, cum_coef, cmatrix))
}

#' Calculate the negative log-likelihood given a data segment and a guess of
#' the coefficient.
#'
#' @param data Data to be used to calculate the negative log-likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log-likelihood.
neg_log_lik_poisson <- function(data, b) {
  p <- dim(data)[2] - 1
  X <- data[, 1:p, drop = FALSE]
  Y <- data[, p + 1, drop = FALSE]
  u <- as.numeric(X %*% b)
  L <- -Y * u + exp(u) + lfactorial(Y)
  return(sum(L))
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Proportion of the data within which change points are ignored
#'   at the beginning, at the end and between change points.
#' @param epsilon Small adjustment to avoid singularity when inverting the
#'   Hessian matrix.
#' @param G Upper bound on the fitted Poisson mean used in the Hessian update.
#' @param L Winsorization lower bound.
#' @param H Winsorization upper bound.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the potential change point locations.
segd_poisson <- function(
    data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1
  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)

  # Choose the initial values based on pre-segmentation.
  index <- rep(1:B, rep(n / B, B))
  coef.int <- matrix(NA, B, p)
  for (i in 1:B) {
    out <- fastglm::fastglm(
      x = as.matrix(data[index == i, 1:p]),
      y = data[index == i, p + 1],
      family = "poisson"
    )
    coef.int[i, ] <- coef(out)
  }
  X1 <- data[1, 1:p]
  cum_coef <- coef <- pmin(pmax(matrix(coef.int[1, ], p, 1), L), H)
  const <- exp(as.numeric(X1 %*% coef))
  cmatrix <- array((X1 %o% X1) * const, c(p, p, 1))

  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:(m - 1)) {
      coef_c <- coef[, i]
      cum_coef_c <- cum_coef[, i]
      cmatrix_c <- cmatrix[, , i]
      out <- cost_poisson_update(
        data[t, ], coef_c, cum_coef_c, cmatrix_c,
        epsilon = epsilon, G = G, L = L, H = H
      )
      coef[, i] <- out[[1]]
      cum_coef[, i] <- out[[2]]
      cmatrix[, , i] <- out[[3]]
      k <- set[i] + 1
      cum_coef_win <- pmin(pmax(cum_coef[, i] / (t - k + 1), L), H)
      if (t - k >= p - 1) {
        cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win)
      } else {
        cval[i] <- 0
      }
    }

    # The choice of initial values requires further investigation.
    cval[m] <- 0
    Xt <- data[t, 1:p]
    cum_coef_add <- coef_add <- pmin(pmax(coef.int[index[t], ], L), H)
    const <- exp(as.numeric(Xt %*% coef_add))
    cmatrix_add <- (Xt %o% Xt) * const
    coef <- cbind(coef, coef_add)
    cum_coef <- cbind(cum_coef, cum_coef_add)
    cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)

    # Adding a momentum term (TBD).
    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    coef <- coef[, ind2, drop = FALSE]
    cum_coef <- cum_coef[, ind2, drop = FALSE]
    cmatrix <- cmatrix[, , ind2, drop = FALSE]
    Fobj <- c(Fobj, min_val)
  }

  # Remove change points close to the boundaries and merge adjacent ones.
  cp <- cp_set[[n + 1]]
  if (length(cp) > 0) {
    ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
    if (length(ind3) > 0) cp <- cp[-ind3]
  }
  cp <- sort(unique(c(0, cp)))
  index <- which(diff(cp) < trim * n)
  if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
  cp <- cp[cp > 0]
  output <- list(cp)
  names(output) <- c("cp")
  return(output)
}

#' Generate data from Poisson regression models with change points.
#'
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#'   assignments.
data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) {
  loc <- unique(c(0, true.cp.loc, n))
  if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
  x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
  y <- NULL
  for (i in 1:(length(loc) - 1)) {
    mu <- exp(x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE])
    group <- rpois(length(mu), mu)
    y <- c(y, group)
  }
  data <- cbind(x, y)
  true_cluster <- rep(1:(length(loc) - 1), diff(loc))
  result <- list(data, true_cluster)
  return(result)
}
```
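To see the shape of what `data_gen_poisson()` returns, here is a small illustrative run (toy parameters assumed here, not taken from the original experiments):

```r
# Generate 200 observations with a single change at t = 100 and inspect the
# structure returned by data_gen_poisson().
set.seed(42)
d <- 3
Sigma <- diag(1, d)
true.coef <- cbind(c(0.5, -0.3, 0.2), c(-0.2, 0.4, -0.1))
sim <- data_gen_poisson(200, d, true.coef, 100, Sigma)
dim(sim[[1]])
#> [1] 200   4
# sim[[1]]: covariates in the first d columns, Poisson response in the last.
# sim[[2]]: the true segment labels, here two clusters of 100 observations.
```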
```r
#' Cost function for penalized linear regression.
#'
#' @param data Data used to calculate the cost values. The last column is
#'   the response variable.
#' @param lambda Penalty coefficient.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_lasso <- function(data, lambda, family = "gaussian") {
  data <- as.matrix(data)
  p <- dim(data)[2] - 1
  out <- glmnet::glmnet(
    as.matrix(data[, 1:p]), data[, p + 1],
    family = family, lambda = lambda
  )
  return(deviance(out) / 2)
}

#' Implementation of vanilla PELT for penalized linear regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the change point locations.
pelt_vanilla_lasso <- function(
    data, beta, B = 10, cost = cost_lasso, family = "gaussian") {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1

  # Pre-segmentation: estimate the error standard deviation and the number of
  # active coefficients on B equal-sized blocks.
  index <- rep(1:B, rep(n / B, B))
  err_sd <- act_num <- rep(NA, B)
  for (i in 1:B) {
    cvfit <- glmnet::cv.glmnet(
      as.matrix(data[index == i, 1:p]), data[index == i, p + 1],
      family = family
    )
    coef <- coef(cvfit, s = "lambda.1se")[-1]
    resi <- data[index == i, p + 1] -
      as.matrix(data[index == i, 1:p]) %*% as.numeric(coef)
    err_sd[i] <- sqrt(mean(resi^2))
    act_num[i] <- sum(abs(coef) > 0)
  }
  err_sd_mean <- mean(err_sd) # Only valid if the error sd does not change.
  act_num_mean <- mean(act_num)
  beta <- (act_num_mean + 1) * beta # Seems to work; there might be better choices.

  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)
  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:m) {
      k <- set[i] + 1
      if (t - k >= 1) {
        cval[i] <- suppressWarnings(
          cost(data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))
        )
      } else {
        cval[i] <- 0
      }
    }
    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    Fobj <- c(Fobj, min_val)
    if (t %% 100 == 0) print(t)
  }
  cp <- cp_set[[n + 1]]
  output <- list(cp)
  names(output) <- c("cp")
  return(output)
}

#' Soft-thresholding operator for the Lasso penalty.
#'
#' @param a Coefficient to be updated.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Updated coefficient.
soft_threshold <- function(a, lambda) {
  sign(a) * pmax(abs(a) - lambda, 0)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coefficient.
#' @param coef Previous coefficient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'   averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the new coefficients, the summation of
#'   coefficients so far and the updated Hessian matrix.
cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) {
  p <- length(data_new) - 1
  X_new <- data_new[1:p]
  Y_new <- data_new[p + 1]
  mu <- X_new %*% coef
  cmatrix <- cmatrix + X_new %o% X_new
  lik_dev <- as.numeric(-(Y_new - mu)) * X_new
  coef <- coef - solve(cmatrix, lik_dev)
  # The choice of norm affects the speed: the spectral norm is more accurate
  # but slower than the Frobenius norm.
  nc <- norm(cmatrix, type = "F")
  coef <- soft_threshold(coef, lambda / nc)
  cum_coef <- cum_coef + coef
  return(list(coef, cum_coef, cmatrix))
}

#' Calculate the negative log-likelihood given a data segment and a guess of
#' the coefficient.
#'
#' @param data Data to be used to calculate the negative log-likelihood.
#' @param b Guess of the coefficient.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log-likelihood.
neg_log_lik_lasso <- function(data, b, lambda) {
  p <- dim(data)[2] - 1
  X <- data[, 1:p, drop = FALSE]
  Y <- data[, p + 1, drop = FALSE]
  resi <- Y - X %*% b
  L <- sum(resi^2) / 2 + lambda * sum(abs(b))
  return(L)
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Proportion of the data within which change points are ignored
#'   at the beginning, at the end and between change points.
#' @param epsilon Small adjustment to avoid singularity when inverting the
#'   Hessian matrix.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the potential change point locations.
segd_lasso <- function(
    data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") {
  n <- dim(data)[1]
  p <- dim(data)[2] - 1
  Fobj <- c(-beta, 0)
  cp_set <- list(NULL, 0)
  set <- c(0, 1)

  # Choose the initial values based on pre-segmentation.
  index <- rep(1:B, rep(n / B, B))
  coef.int <- matrix(NA, B, p)
  err_sd <- act_num <- rep(NA, B)
  for (i in 1:B) {
    cvfit <- glmnet::cv.glmnet(
      as.matrix(data[index == i, 1:p]), data[index == i, p + 1],
      family = family
    )
    coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1]
    resi <- data[index == i, p + 1] -
      as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ])
    err_sd[i] <- sqrt(mean(resi^2))
    act_num[i] <- sum(abs(coef.int[i, ]) > 0)
  }
  err_sd_mean <- mean(err_sd) # Only valid if the error sd does not change.
  act_num_mean <- mean(act_num)
  beta <- (act_num_mean + 1) * beta # Seems to work; there might be better choices.

  X1 <- data[1, 1:p]
  cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
  cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1))

  for (t in 2:n) {
    m <- length(set)
    cval <- rep(NA, m)
    for (i in 1:(m - 1)) {
      coef_c <- coef[, i]
      cum_coef_c <- cum_coef[, i]
      cmatrix_c <- cmatrix[, , i]
      k <- set[i] + 1
      out <- cost_lasso_update(
        data[t, ], coef_c, cum_coef_c, cmatrix_c,
        lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))
      )
      coef[, i] <- out[[1]]
      cum_coef[, i] <- out[[2]]
      cmatrix[, , i] <- out[[3]]
      if (t - k >= 2) {
        cval[i] <- neg_log_lik_lasso(
          data[k:t, ], cum_coef[, i] / (t - k + 1),
          lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))
        )
      } else {
        cval[i] <- 0
      }
    }

    # The choice of initial values requires further investigation.
    cval[m] <- 0
    Xt <- data[t, 1:p]
    cum_coef_add <- coef_add <- coef.int[index[t], ]
    coef <- cbind(coef, coef_add)
    cum_coef <- cbind(cum_coef, cum_coef_add)
    cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3)

    obj <- cval + Fobj[set + 1] + beta
    min_val <- min(obj)
    ind <- which(obj == min_val)[1]
    cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
    cp_set <- append(cp_set, list(cp_set_add))
    ind2 <- (cval + Fobj[set + 1]) <= min_val
    set <- c(set[ind2], t)
    coef <- coef[, ind2, drop = FALSE]
    cum_coef <- cum_coef[, ind2, drop = FALSE]
    cmatrix <- cmatrix[, , ind2, drop = FALSE]
    Fobj <- c(Fobj, min_val)
  }

  # Remove change points close to the boundaries and merge adjacent ones.
  cp <- cp_set[[n + 1]]
  if (length(cp) > 0) {
    ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
    if (length(ind3) > 0) cp <- cp[-ind3]
  }
  cp <- sort(unique(c(0, cp)))
  index <- which(diff(cp) < trim * n)
  if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
  cp <- cp[cp > 0]
  output <- list(cp)
  names(output) <- c("cp")
  return(output)
}

#' Generate data from penalized linear regression models with change points.
#'
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @param evar Error variance.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#'   assignments.
data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) {
  loc <- unique(c(0, true.cp.loc, n))
  if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
  x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
  y <- NULL
  for (i in 1:(length(loc) - 1)) {
    Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE]
    add <- Xb + rnorm(length(Xb), sd = sqrt(evar))
    y <- c(y, add)
  }
  data <- cbind(x, y)
  true_cluster <- rep(1:(length(loc) - 1), diff(loc))
  result <- list(data, true_cluster)
  return(result)
}
```
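The sparsity in `cost_lasso_update()` comes from applying `soft_threshold()`, the proximal operator of the L1 penalty, after each quasi-Newton step. A quick illustration of its effect (an added example, not part of the original code):

```r
# Values within [-lambda, lambda] are set exactly to zero and the rest are
# shrunk toward zero by lambda, which is what keeps the SeGD coefficient
# estimates sparse along the path.
soft_threshold(c(-3, -0.5, 0, 0.5, 3), lambda = 1)
#> [1] -2  0  0  0  2
```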
```r
set.seed(1)
p <- 5
x <- matrix(rnorm(300 * p, 0, 1), ncol = p)

# Randomly generate coefficients with different means.
theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))

# Randomly generate response variables based on the segmented data and
# corresponding coefficients.
y <- c(
  rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
  rbinom(300 - 125, 1, 1 / (1 + exp(-x[(125 + 1):300, ] %*% theta[2, ])))
)

segd_binomial(cbind(x, y), (p + 1) * log(300) / 2, B = 5)$cp
#> [1] 125

fastcpd.binomial(
  cbind(y, x),
  segment_count = 5,
  beta = "BIC",
  cost_adjustment = "BIC",
  r.progress = FALSE
)@cp_set
#> [1] 125

pelt_vanilla_binomial(cbind(x, y), (p + 1) * log(300) / 2)$cp
#> [1] 0 125

fastcpd.binomial(
  cbind(y, x),
  segment_count = 5,
  vanilla_percentage = 1,
  beta = "BIC",
  cost_adjustment = "BIC",
  r.progress = FALSE
)@cp_set
#> [1] 125
```
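The manual implementations above take the numeric penalty `(p + 1) * log(300) / 2` while `fastcpd.binomial()` is given `beta = "BIC"`; the two appear to coincide, assuming the package's BIC penalty is `(p + 1) * log(n) / 2` for a model with `p + 1` parameters:

```r
# Assumed equivalence (illustrative): the numeric penalty used above equals
# the BIC penalty for p + 1 = 6 parameters and n = 300 observations.
(5 + 1) * log(300) / 2
#> [1] 17.11135
```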
```r
set.seed(1)
n <- 1500
d <- 5
rho <- 0.9
Sigma <- array(0, c(d, d))
for (i in 1:d) {
  Sigma[i, ] <- rho^(abs(i - (1:d)))
}
delta <- c(5, 7, 9, 11, 13)
a.sq <- 1
delta.new <- delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta))
true.cp.loc <- c(375, 750, 1125)

# Regression coefficients.
true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1)
true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2)
true.coef[, 2] <- true.coef[, 1] + delta.new
true.coef[, 3] <- true.coef[, 1]
true.coef[, 4] <- true.coef[, 3] - delta.new

out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma)
data <- out[[1]]
g_tr <- out[[2]]
beta <- log(n) * (d + 1) / 2

segd_poisson(
  data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20
)$cp
#> [1] 380 751 1136 1251

fastcpd.poisson(
  cbind(data[, d + 1], data[, 1:d]),
  beta = beta,
  cost_adjustment = "BIC",
  epsilon = 0.001,
  segment_count = 10,
  r.progress = FALSE
)@cp_set
#> [1] 380 751 1136 1251

pelt_vanilla_poisson(data, beta)$cp
#> [1] 0 374 752 1133

fastcpd.poisson(
  cbind(data[, d + 1], data[, 1:d]),
  segment_count = 10,
  vanilla_percentage = 1,
  beta = beta,
  cost_adjustment = "BIC",
  r.progress = FALSE
)@cp_set
#> [1] 374 752 1133
```
```r
set.seed(1)
n <- 1000
s <- 3
d <- 50
evar <- 0.5
Sigma <- diag(1, d)
true.cp.loc <- c(100, 300, 500, 800, 900)
seg <- length(true.cp.loc) + 1
true.coef <- matrix(rnorm(seg * s), s, seg)
true.coef <- rbind(true.coef, matrix(0, d - s, seg))
out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar)
data <- out[[1]]
beta <- log(n) / 2 # beta has a different meaning here.

segd_lasso(data, beta, B = 10, trim = 0.025)$cp
#> [1] 100 300 520 800 901

fastcpd.lasso(
  cbind(data[, d + 1], data[, 1:d]),
  epsilon = 1e-5,
  beta = beta,
  cost_adjustment = "BIC",
  pruning_coef = 0,
  r.progress = FALSE
)@cp_set
#> [1] 100 300 520 800 901

pelt_vanilla_lasso(data, beta, cost = cost_lasso)$cp
#> [1] 100
#> [1] 200
#> [1] 300
#> [1] 400
#> [1] 500
#> [1] 600
#> [1] 700
#> [1] 800
#> [1] 900
#> [1] 1000
#> [1] 0 103 299 510 800 900

fastcpd.lasso(
  cbind(data[, d + 1], data[, 1:d]),
  vanilla_percentage = 1,
  epsilon = 1e-5,
  beta = beta,
  cost_adjustment = "BIC",
  pruning_coef = 0,
  r.progress = FALSE
)@cp_set
#> [1] 103 299 510 800 900
```
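One way to quantify how close the detected change points are to the truth in the examples above is a Hausdorff-type distance between the two sets. A small helper sketch (`cp_distance()` is an illustrative addition, not part of the original comparison):

```r
# Largest distance between any true change point and its nearest estimate,
# and vice versa; small values mean the two sets agree closely.
cp_distance <- function(cp_est, cp_true) {
  max(
    sapply(cp_true, function(cp) min(abs(cp_est - cp))),
    sapply(cp_est, function(cp) min(abs(cp_true - cp)))
  )
}
cp_distance(c(100, 300, 520, 800, 901), c(100, 300, 500, 800, 900))
#> [1] 20
# For the lasso example the distance is driven by the 520 versus 500 call.
```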
This document is generated by the following code:
```bash
R -e 'knitr::knit("vignettes/comparison-pelt.Rmd.original", output = "vignettes/comparison-pelt.Rmd")'
```