R/MultipleChangePoint_NumKnown.R

Defines functions detect_multiple_cp

Documented in detect_multiple_cp

# Function to detect Multiple Change Points when number of change-points to detect is known
# Inputs:
# Data matrix X (n X d) : whose rows are the d dimensional observations
# Distance matrix (n X n)
# dist.method : Linkage method to be used in the hierarchical clustering (single, average or complete)
# numcp : Number of change-points to be detected (should be more than 1)
# Output:
# detectcp: Detected location vector for detected multiple change-points
#' Detecting multiple change-point locations for known number of change-points
#'
#' @param X Data matrix (n X p) where n denotes number of observations. Each row is a p dimensional observation vector. n observations are arranged in chronological order.
#' @param D Distance matrix (n X n) corresponding to the data matrix. Either the data matrix or the distance matrix should be supplied. Default is the Euclidean distance to construct the distance matrix when data matrix is supplied.
#' @param numcp Number of change-points to detect
#' @param dist.method Linkage method to use in hierarchical clustering for calculating the distances between consecutive clusters. This must be one of "single", "average" or "complete". Default is "average".
#'
#' @return Returns a numeric vector denoting estimated change-point locations
#' @export
#'
#' @examples
#' # Example 1
#' set.seed(1)
#' # Generate data matrix
#' X1 = matrix(rnorm((15 * 50), mean = 0, sd = 1), nrow = 15, ncol = 50)
#' X2 = matrix(rnorm((15 * 50), mean = 1, sd = 1), nrow = 15, ncol = 50)
#' X3 = matrix(rnorm((15 * 50), mean = 2, sd = 1), nrow = 15, ncol = 50)
#' X = rbind(X1, X2, X3)
#'
#' # Detect two change-points with default average linkage
#' detect_multiple_cp(X = X, numcp = 2)
#'
#' # Detect four change-points with default complete linkage
#' detect_multiple_cp(X = X, numcp = 4, dist.method = "complete")
#'
#' # Example 2
#' set.seed(1)
#' # Generate data matrix
#' X1 = matrix(rnorm((15 * 50), mean = 0, sd = 1), nrow = 15, ncol = 50)
#' X2 = matrix(rnorm((15 * 50), mean = 1, sd = 1), nrow = 15, ncol = 50)
#' X3 = matrix(rnorm((15 * 50), mean = 2, sd = 1), nrow = 15, ncol = 50)
#' X = rbind(X1, X2, X3)
#'
#' # Calculate distance matrix
#' D_mat = as.matrix(stats::dist(X, method = "euclidean"))
#'
#' # Only distance matrix is supplied
#' # Detect multiple change-points (here 2) with default average linkage
#' detect_multiple_cp(D = D_mat, numcp = 2)  # correct change-points are detected
detect_multiple_cp = function(X = NULL, D = NULL, numcp, dist.method = "average")
{
  # Check if either X or D is supplied or not
  if((is.null(X) == TRUE) && (is.null(D) == TRUE))
  {
    stop("Supply either data matrix or distance matrix")
  }

  # Check if Distance matrix is supplied or not, if supplied
  if(is.null(D) == FALSE)
  {
    if(nrow(D) != ncol(D))
    {
      stop("Incorrect supplied distance matrix")
    }

    dist_mat = D
  }else{
    # Calculate distance matrix
    dist_mat = as.matrix(stats::dist(X, method = "euclidean"))
  }

  # Number of observations
  n = nrow(dist_mat)

  # Compatibility check if numcp greater than 1 and less than n
  if((numcp == 1) || (numcp >= n))
  {
    stop("Incorrect number of change-points to detect.")
  }




  # Initialize the matrix with zeroes which will monitor merging of clusters
  trackclust_mat = matrix(0, n, n)

  # At initial stage one observation corresponds to one cluster
  trackclust_mat[1, ] = 1:n

  # Running loop till n - (numcp + 1) (since desired number of clusters is numcp + 1)
  for(j in 1:(n - (numcp + 1)))
  {
    # Since at each stage the number of clusters decreases by 1
    l = n - j

    # looking at current stage of number of clusters
    current_track = trackclust_mat[j, ]

    # Calculating linkage distances
    # loop till 1:l at each stage because we need to work with two consecutive clusters at a time
    link_dist = sapply(1:l, function(i) {

      # Considering two consecutive clusters at a time
      # One cluster index
      clust_index_1 = current_track[i]:(current_track[i+1]-1)
      # Immediate next cluster index
      clust_index_2 = current_track[i+1]:(ifelse( i < l, current_track[i+2] - 1, n))

      # Distance between the two consecutive clusters

      if(dist.method == "single")
      {
        # return single linkage value
        return(min(dist_mat[clust_index_1,clust_index_2]))
      } else if(dist.method == "average") {
        # return average linkage value
        return(mean(dist_mat[clust_index_1,clust_index_2]))
      }else{
        # return complete linkage value
        return(max(dist_mat[clust_index_1,clust_index_2]))
      }
    })

    # detecting which clusters are closest
    current_changepoint = which.min(link_dist)

    # Merging the clusters and storing the numbers from which only new clusters start
    trackclust_mat[j+1, ] = c(current_track[-(current_changepoint + 1)], 0)
  }

  # Extracting the detected multiple change-points (desired clusters numcp + 1)
  detect.multiple.cp = trackclust_mat[n - (numcp + 1) + 1, 1:(numcp + 1)] - 1
  detect.multiple.cp = detect.multiple.cp[detect.multiple.cp != 0]

  #Return detected change-points
  return(detect.multiple.cp)
}
DawnTrisha/hclustHDCP documentation built on Dec. 17, 2021, 4:10 p.m.