algorithms.R
In aaaaana/changepoints:

##########
# README #
##########
# This R script contain two implementations of algorithms to search for exact optimal change points
# Algorithm 1: Optimal Partitioning (Opt)
# Algorithm 2: Pruned Exact Linear Time (PELT)
# Both algorithm successfully detected change points with test continuous data

# Load Libraries
library(seqinr)
library(markovchain)

###############
## Algorithms #
###############

#' Optimal Partitioning
#' @param  data = sequence of data
#' @param  measure = a function measuring error of fiting (higher value = worse fitting)
#' @param  beta = penalty
#' @param  markov = whether we are modelling the data in markov, if yes, we return the trans prob of segments as well
#'
#' @return  a list containing i) vector of change points ii) segments of data iii) transition prob of segments

opt = function(data, measure, beta, markov = TRUE){
  
  # Initialization
  n = length(data)
  f = rep(0, n+1); f[1] = -beta # f is the optimization objective
  cp = rep(list(c()), n + 1)
  
  # Dynamic Programming
  for (t_star in 2:n){
    hold = c()
    for (i in 1:(t_star - 1)){
      hold = c(hold, f[i] + measure(data[(i + 1):t_star]) + beta)
    }
    f[t_star] = min(hold)
    tau = which.min(hold)
    cp[[t_star]] = c(cp[[tau]], tau)
  }
  
  # Results
  if (length(cp[[n]]) == 1){
    print("There are no change points detected")
  }
  else{
    if (markov){
      points = cp[[n]][-1]
      ls = rep(list(c()), length(cp[[n]]) - 1)
      P_ls = rep(list(), length(cp[[n]] - 1))
      cp = c(1, cp[[n]][-1], length(data))
      for (i in 1: (length(cp) - 1)){
        ls[[i]] = (data[(cp[i]+1):cp[i+1]])
        P_ls[[i]] = markovchainFit(data[(cp[i]+1):cp[i+1]])$estimate
      }
      return(list(cp = points, seg = ls, P_ls = P_ls))
    }
    else{
      points = cp[[n]][-1]
      ls = rep(list(c()), length(cp[[n]]) - 1)
      cp = c(1, cp[[n]][-1], length(data))
      for (i in 1: (length(cp) - 1)){
        ls[[i]] = (data[(cp[i]+1):cp[i+1]])
      }
      return(list(cp = points, seg = ls, P_ls = NULL))
    }
  }
}

#' Pruned Exact Linear Time
#' @param  data = sequence of data
#' @param  measure = a function measuring error of fiting (higher value = worse fitting)
#' @param  beta = penalty
#' @param  K = pruning penalty
#' @param  markov = whether we are modelling the data in markov, if yes, we return the trans prob of segments as well
#' 
#' @return  a list containing i) vector of change points ii) segments of data iii) transition prob of segments

PELT <- function(data, measure, beta, K, markov = TRUE){
  
  # Initialization
  n = length(data) 
  f = rep(0, n + 1); f[1] = -beta;
  cp = rep(list(c()), n + 1)
  R = rep(list(c()), n+1); R[[2]] = c(1)
  
  # Main Algorithm
  for (t_star in 2:(n)){
    # step 1
    hold = c();
    for (item in length(R[[t_star]])){
      hold = c(hold, f[R[[t_star]][item]] + measure(data[(R[[t_star]][item]+1):t_star]) + beta)
    }
    f[t_star] = min(hold)
    
    # step 2
    tau = R[[t_star]][which.min(hold)]
    
    # step 3
    cp[[t_star]] = c(cp[[tau]], tau)
    
    # step 4
    for (tau in R[[t_star]]){
      if (f[tau] + measure(data[(tau + 1) : t_star]) + K <= f[t_star]){
        R[[t_star + 1]] = c(R[[t_star + 1]], tau)
      }
    }
    R[[t_star + 1]] = c(R[[t_star + 1]], t_star)
  }
  if (length(cp[[n]]) == 1){
    print("There are no change points detected")
  }
  else{
    if (markov){
      points = cp[[n]][-1]
      ls = rep(list(c()), length(cp[[n]]) - 1)
      P_ls = rep(list(), length(cp[[n]] - 1))
      cp = c(1, cp[[n]][-1], length(data))
      for (i in 1: (length(cp) - 1)){
        ls[[i]] = (data[(cp[i]+1):cp[i+1]])
        P_ls[[i]] = markovchainFit(data[(cp[i]+1):cp[i+1]])$estimate
      }
      return(list(cp = points, seg = ls, P_ls = P_ls))
    }
    else{
      points = cp[[n]][-1]
      ls = rep(list(c()), length(cp[[n]]) - 1)
      cp = c(1, cp[[n]][-1], length(data))
      for (i in 1: (length(cp) - 1)){
        ls[[i]] = (data[(cp[i]+1):cp[i+1]])
      }
      return(list(cp = points, seg = ls, P_ls = NULL))
    }
  }
}

#---------------------------------------------------------------------------
##########
## test ##
##########

# Normal data
data = rnorm(100,0,1)
data = c(data, rnorm(100, 10, 2))
data = c(data, rnorm(200, -10, 2))
measure = function(data){
  if (length(data) == 1){
    return (0)
  }
  else - sum(log(dnorm(data, mean = mean(data), sd = sqrt(var(data)))))
}

opt(data, measure, 8) 
PELT(data, measure, 5, 3)

# Genomic data

dat <- read.fasta(file="changepoints/data/chrI.fa.gz")
dat2 <- unlist(dat, use.names=FALSE)
dat2f <- as.factor(dat2)
test = dat2f[200:600]

loglikhood <- function(data){
  out = -markovchainFit(data)$logLikelihood
  out
}

PELT(test, loglikhood, 0.55, 0.1)
opt(test, loglikhood, 8)


#-----------------------------------------
# Some diagnostic

test_long = dat2f[1:10000]
test_obj = PELT(test_long, loglikhood, 0.5, 0.1) # Pretty impressive, finished running within 4min

test_obj$cp 
test_obj$seg
test_obj$P_ls


length(test_obj$cp)
test_obj$seg
test_obj$P_ls


# -------------------------------
# testing on truncated data

test_trunc = test[(test == "c" | test == "g")]
test_trunc_obj = opt(test_trunc, loglikhood, 2)
test_trunc_obj$cp
test_trunc_obj$seg

test

obj = opt(test, loglikhood, 8)
obj$cp
obj$seg