##########
# README #
##########
# This R script contain two implementations of algorithms to search for exact optimal change points
# Algorithm 1: Optimal Partitioning (Opt)
# Algorithm 2: Pruned Exact Linear Time (PELT)
# Both algorithm successfully detected change points with test continuous data
# Load Libraries
library(seqinr)
library(markovchain)
###############
## Algorithms #
###############
#' Optimal Partitioning
#' @param data = sequence of data
#' @param measure = a function measuring error of fiting (higher value = worse fitting)
#' @param beta = penalty
#' @param markov = whether we are modelling the data in markov, if yes, we return the trans prob of segments as well
#'
#' @return a list containing i) vector of change points ii) segments of data iii) transition prob of segments
opt = function(data, measure, beta, markov = TRUE){
# Initialization
n = length(data)
f = rep(0, n+1); f[1] = -beta # f is the optimization objective
cp = rep(list(c()), n + 1)
# Dynamic Programming
for (t_star in 2:n){
hold = c()
for (i in 1:(t_star - 1)){
hold = c(hold, f[i] + measure(data[(i + 1):t_star]) + beta)
}
f[t_star] = min(hold)
tau = which.min(hold)
cp[[t_star]] = c(cp[[tau]], tau)
}
# Results
if (length(cp[[n]]) == 1){
print("There are no change points detected")
}
else{
if (markov){
points = cp[[n]][-1]
ls = rep(list(c()), length(cp[[n]]) - 1)
P_ls = rep(list(), length(cp[[n]] - 1))
cp = c(1, cp[[n]][-1], length(data))
for (i in 1: (length(cp) - 1)){
ls[[i]] = (data[(cp[i]+1):cp[i+1]])
P_ls[[i]] = markovchainFit(data[(cp[i]+1):cp[i+1]])$estimate
}
return(list(cp = points, seg = ls, P_ls = P_ls))
}
else{
points = cp[[n]][-1]
ls = rep(list(c()), length(cp[[n]]) - 1)
cp = c(1, cp[[n]][-1], length(data))
for (i in 1: (length(cp) - 1)){
ls[[i]] = (data[(cp[i]+1):cp[i+1]])
}
return(list(cp = points, seg = ls, P_ls = NULL))
}
}
}
#' Pruned Exact Linear Time
#' @param data = sequence of data
#' @param measure = a function measuring error of fiting (higher value = worse fitting)
#' @param beta = penalty
#' @param K = pruning penalty
#' @param markov = whether we are modelling the data in markov, if yes, we return the trans prob of segments as well
#'
#' @return a list containing i) vector of change points ii) segments of data iii) transition prob of segments
PELT <- function(data, measure, beta, K, markov = TRUE){
# Initialization
n = length(data)
f = rep(0, n + 1); f[1] = -beta;
cp = rep(list(c()), n + 1)
R = rep(list(c()), n+1); R[[2]] = c(1)
# Main Algorithm
for (t_star in 2:(n)){
# step 1
hold = c();
for (item in length(R[[t_star]])){
hold = c(hold, f[R[[t_star]][item]] + measure(data[(R[[t_star]][item]+1):t_star]) + beta)
}
f[t_star] = min(hold)
# step 2
tau = R[[t_star]][which.min(hold)]
# step 3
cp[[t_star]] = c(cp[[tau]], tau)
# step 4
for (tau in R[[t_star]]){
if (f[tau] + measure(data[(tau + 1) : t_star]) + K <= f[t_star]){
R[[t_star + 1]] = c(R[[t_star + 1]], tau)
}
}
R[[t_star + 1]] = c(R[[t_star + 1]], t_star)
}
if (length(cp[[n]]) == 1){
print("There are no change points detected")
}
else{
if (markov){
points = cp[[n]][-1]
ls = rep(list(c()), length(cp[[n]]) - 1)
P_ls = rep(list(), length(cp[[n]] - 1))
cp = c(1, cp[[n]][-1], length(data))
for (i in 1: (length(cp) - 1)){
ls[[i]] = (data[(cp[i]+1):cp[i+1]])
P_ls[[i]] = markovchainFit(data[(cp[i]+1):cp[i+1]])$estimate
}
return(list(cp = points, seg = ls, P_ls = P_ls))
}
else{
points = cp[[n]][-1]
ls = rep(list(c()), length(cp[[n]]) - 1)
cp = c(1, cp[[n]][-1], length(data))
for (i in 1: (length(cp) - 1)){
ls[[i]] = (data[(cp[i]+1):cp[i+1]])
}
return(list(cp = points, seg = ls, P_ls = NULL))
}
}
}
#---------------------------------------------------------------------------
##########
## test ##
##########
# Normal data
data = rnorm(100,0,1)
data = c(data, rnorm(100, 10, 2))
data = c(data, rnorm(200, -10, 2))
measure = function(data){
if (length(data) == 1){
return (0)
}
else - sum(log(dnorm(data, mean = mean(data), sd = sqrt(var(data)))))
}
opt(data, measure, 8)
PELT(data, measure, 5, 3)
# Genomic data
dat <- read.fasta(file="changepoints/data/chrI.fa.gz")
dat2 <- unlist(dat, use.names=FALSE)
dat2f <- as.factor(dat2)
test = dat2f[200:600]
loglikhood <- function(data){
out = -markovchainFit(data)$logLikelihood
out
}
PELT(test, loglikhood, 0.55, 0.1)
opt(test, loglikhood, 8)
#-----------------------------------------
# Some diagnostic
test_long = dat2f[1:10000]
test_obj = PELT(test_long, loglikhood, 0.5, 0.1) # Pretty impressive, finished running within 4min
test_obj$cp
test_obj$seg
test_obj$P_ls
length(test_obj$cp)
test_obj$seg
test_obj$P_ls
# -------------------------------
# testing on truncated data
test_trunc = test[(test == "c" | test == "g")]
test_trunc_obj = opt(test_trunc, loglikhood, 2)
test_trunc_obj$cp
test_trunc_obj$seg
test
obj = opt(test, loglikhood, 8)
obj$cp
obj$seg
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.