R/statisticalTests.R
In markovchain: Easy Handling Discrete Time Markov Chains

Documented in assessOrder assessStationarity verifyEmpiricalToTheoretical verifyHomogeneity verifyMarkovProperty

#helper functions


#helper function for checkMP

.findNijPjk<-function(Nijk = Nijk, Nij = Nij, trans, row = 1){
  i <- Nijk[row,1]
  j <- Nijk[row,2]
  k <- Nijk[row,3]
  
  fromCh <- as.character(j)
  toCh <- as.character(k)
  Pjk <- trans[fromCh,toCh]
  
  m1 <- which(Nij[, 1] == i)
  m2 <- which(Nij[, 2] == j)
  m <- c(m1, m2)
  return(Nij[m[anyDuplicated(m)], 3] * Pjk)
}




#' @name verifyMarkovProperty
#' 
#' @rdname statisticalTests
#' @family statisticalTests
#' 
#' @title Various functions to perform statistical inference of DTMC
#' @description These functions verify the Markov property, assess 
#'              the order and stationarity of the Markov chain.
#' 
#' @param sequence An empirical sequence.
#' @param verbose Should test results be printed out?
#' @param nblocks Number of blocks.
#' 
#' @return Verification result
#' 
#' @references Anderson and Goodman.
#' 
#' @author Tae Seung Kang, Giorgio Alfredo Spedicato
#' 
#' @seealso \code{markovchain}
#' 
#' @examples 
#' sequence <- c("a", "b", "a", "a", "a", "a", "b", "a", "b",
#'               "a", "b", "a", "a", "b", "b", "b", "a")
#' mcFit <- markovchainFit(data = sequence, byrow = FALSE)
#' verifyMarkovProperty(sequence)
#' assessOrder(sequence)
#' assessStationarity(sequence, 1)
#' 
#' 
#' @export

# check if the sequence holds the Markov property
verifyMarkovProperty <- function(sequence, verbose = TRUE) {
  #warning("The accuracy of the statistical inference functions has been questioned. It will be thoroughly investigated in future versions of the package.")
  
  #fitting the markovchain
  
  transMatrix <- markovchainFit(data = sequence)$estimate@transitionMatrix
  
  #make the (n-2)x3 matrix for observations
  subSample<-sequence[1:(length(sequence) - (length(sequence)%%3))]
  
  seqSet1<-matrix(c(subSample[1:(length(subSample) - 2)],
                    subSample[2:(length(subSample) - 1)],
                    subSample[3:(length(subSample))]
  ),ncol = 3) #fill the matrix in reverse order so position 11 is the first obersvation,12 second and 13 third
  #compute row frequencies
  temp<-as.data.frame(seqSet1)
  Nijk<-aggregate(temp, by = temp, length)[1:(ncol(temp) + 1)]
  
  seqSet2 <- seqSet1[, -3] #make matrix of couples
  temp2 <- as.data.frame(seqSet2)
  Nij <- aggregate(temp2, by = temp2, length)[1:(ncol(temp2) + 1)] #rowfrequencies included
  
  
  test<-c(length = dim(Nijk)[1])
  #compute the test statistic
  invisible(lapply(seq_len(dim(Nijk)[1]),function(i)
    {
    foundNijPjk <- .findNijPjk(Nijk = Nijk, Nij = Nij, trans = transMatrix, row = i)
    test[i] <<- ((Nijk[i,4]-foundNijPjk)^2)/foundNijPjk
  })
  )
  statistic <- sum(test)
  #return value of the test statistic and test at confience level 95% and 99%
  
  #dof
  #Steps : No. of df = No. of triplets - No. of doubles + No. of observations - 1

#Creating vector of doubles/pairs
doubles = numeric(length(sequence)-1)
for(i in 1:(length(doubles))) {doubles[i] = paste(sequence[i], sequence[i+1], sep="", collapse = NULL)}

#Creating vector of triplets
triples = numeric(length(sequence)-2)
for(i in 1:(length(triples))) {triples[i] = paste(sequence[i], sequence[i+1], sequence[i+2], sep="", collapse = NULL)}

#Hence no. of df is---
dof = length(unique(triples)) - length(unique(doubles)) + length(unique(sequence)) - 1 

  
  
  
  pvalue <- 1-pchisq(q = statistic,df = dof)
  
  out <- list(statistic = statistic,dof = dof,p.value = pvalue)
  

  if (verbose == TRUE) {
    cat("Testing markovianity property on given data sequence\n")
    cat("Chi - square statistic is:", statistic, "\n")
    cat("Degrees of freedom are:", dof, "\n")
    cat("And corresponding p-value is:", pvalue, "\n")  
  }
  
  invisible(out)
}



#' @rdname statisticalTests
#' @export

# check if sequence is of first order or of second order
assessOrder <- function(sequence, verbose = TRUE) {
  warning("The accuracy of the statistical inference functions has been questioned. It will be thoroughly investigated in future versions of the package.")
  # length of sequence
  n <- length(sequence)
  
  # unique states
  states <- unique(sequence)
  
  # number of unique states
  nelements <- length(states)
  
  TStat <- 0
  for(present in states) {
    # going to be a transition matrix 
    mat <- zeros(nelements)
    dimnames(mat) <- list(states, states)
    
    # populate transition matrix
    for(i in 1:(n - 2)) {
      if(present == sequence[i + 1]) {
        past <- sequence[i]
        future <- sequence[i + 2]
        mat[past, future] <- mat[past, future] + 1 
      }
    }
    
    # chi-squared test
    res <- chisq.test(mat)
    TStat <- TStat + res$statistic
  }
  
  k <- nelements
  df <- k * (k - 1)^2
  pvalue <- 1-pchisq(q = TStat, df)
  out <- list(statistic = TStat[[1]], p.value = pvalue[[1]])
  
  # returning the output
  if (verbose == TRUE) {
    cat("The assessOrder test statistic is: ", TStat, "\n")
    cat("The Chi-Square d.f. are: ", df, "\n")
    cat("The p-value is: ", pvalue, "\n")
  }
  
  invisible(out)
}

#' @rdname statisticalTests
#' @export

# check if sequence is stationary
assessStationarity <- function(sequence, nblocks, verbose = TRUE) {
  warning("The accuracy of the statistical inference functions has been questioned. It will be thoroughly investigated in future versions of the package.")
  # length of sequence
  n <- length(sequence)
  
  # size of each block
  blocksize <- n / nblocks
  
  # vector of unique states
  states <- unique(sequence)
  
  # number of states
  nstates <- length(states) 
  
  # sum of the statistics
  TStat <- 0 
  
  # chi-squared test for each state
  for(i in states) {
    
    # init matrix
    mat <- matrix(0,nblocks, nstates)
    dimnames(mat) <- list(1:nblocks, states)
    
    # compute the transition matrix from sequence
    for(j in 1:(n - 1)) {
      if(sequence[j] == i) {
        # row index
        b <- ceiling(j / blocksize) 
        
        # next state
        future <- sequence[j+1] 
        
        # update transition matrix
        mat[b, future] <- mat[b, future] + 1 
      }
    }
    
    # vector to store row sum of matrix
    rowsums <- rowSums(mat)
    
    # store the indices with zero row sum
    indices <- which(rowsums == 0) 
    
    # update rows with zero sum
    for(k in indices) mat[k, ] <- 1/nstates 
    
    # update row sum after checking zero sum row
    rowsums <- rowSums(mat)
    
    # row-wise normalize. 
    mat <- mat/rowsums 
    
    # Some columns may still be all zeros. This causes NaN for chi-squared test.
    # chi-squared test
    res <- chisq.test(mat)
    TStat <- TStat + res$statistic
  }
  
  k <- nstates
  
  # degree of freedom
  df <- k * (nblocks - 1) * (k - 1)
  pvalue <- 1 - pchisq(TStat, df)
  
  # returning the output
  
  if (verbose==TRUE) {
    cat("The assessStationarity test statistic is: ", TStat, "\n")
    cat("The Chi-Square d.f. are: ", df, "\n")
    cat("The p-value is: ", pvalue, "\n")
  }
  
  out <- list(statistic = TStat[[1]], p.value = pvalue[[1]])
  
  invisible(out)
}

# sequence to transition frequencey matrix
.seq2mat <- function(sequence) {
  
  # basic requirement to create transition matrix
  n <- length(sequence)
  states <- unique(sequence)
  nstates <- length(states)
  
  # create transition matrix
  mat <- zeros(nstates)
  dimnames(mat) <- list(states, states)
  
  # populate transition matrix
  for(i in 1:(n - 1)) {
    from <- sequence[i]
    to <- sequence[i+1]
    mat[from, to] <- mat[from, to] + 1
  }
  
  return (mat)
}




#' @title test whether an empirical transition matrix is compatible to a theoretical one
#' 
#' @description This function tests whether an empirical transition matrix is statistically compatible
#' with a theoretical one. It is a chi-square based test. In case a cell in the empirical transition matrix is >0
#' that is 0 in the theoretical transition matrix the null hypothesis is rejected.
#' 
#' @rdname statisticalTests
#' @family statisticalTests
#'
#' @param data matrix, character or list to be converted in a raw transition matrix
#' @param object a markovchain object
#'
#' @return a list with following slots: statistic (the chi - square statistic), dof (degrees of freedom), and corresponding p-value. In case a cell in the empirical transition matrix is >0
#' that is 0 in the theoretical transition matrix the null hypothesis is rejected. In that case a p-value of 0 and statistic and dof of NA are returned.
#' @export
#'
#' @examples
#' 
#' #Example taken from Kullback Kupperman Tests for Contingency Tables and Markov Chains
#' 
#' sequence<-c(0,1,2,2,1,0,0,0,0,0,0,1,2,2,2,1,0,0,1,0,0,0,0,0,0,1,1,
#' 2,0,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,2,1,0,
#' 0,2,1,0,0,0,0,0,0,1,1,1,2,2,0,0,2,1,1,1,1,2,1,1,1,1,1,1,1,1,1,0,2,
#' 0,1,1,0,0,0,1,2,2,0,0,0,0,0,0,2,2,2,1,1,1,1,0,1,1,1,1,0,0,2,1,1,
#' 0,0,0,0,0,2,2,1,1,1,1,1,2,1,2,0,0,0,1,2,2,2,0,0,0,1,1)
#' 
#' mc=matrix(c(5/8,1/4,1/8,1/4,1/2,1/4,1/4,3/8,3/8),byrow=TRUE, nrow=3)
#' rownames(mc)<-colnames(mc)<-0:2; theoreticalMc<-as(mc, "markovchain")
#' 
#' verifyEmpiricalToTheoretical(data=sequence,object=theoreticalMc)
#' 

verifyEmpiricalToTheoretical <- function(data, object, verbose = TRUE) {
  #warning("The accuracy of the statistical inference functions has been questioned. It will be thoroughly investigated in future versions of the package.")  
  if (!is(object,'markovchain')  ) stop("Error! Object should belong to the markovchain class")
  if (missing(data) | missing(object)) stop("Error! Required inputs missing")
  if (  !is.numeric(data) || is.character(data) || is.matrix(data)) stop("Error! Data should be either a raw transition matrix or 
                                                                  either a character or a numeric element")
  
  if (is.numeric(data)  || is.character(data) ) data<-createSequenceMatrix(stringchar = data, possibleStates = states(object))
  
  if (length(setdiff(names(data),names(object))) > 0) stop("Error! Empirical and theoretical tm have different support")
  
  # (possibly rearrange columns and rownames)
  
  data <- data[match(rownames(data),names(object)),] #matching rows
  data <- data[,match(colnames(data),names(object))] #matching cols
  
  if (sum((data == 0) == (object@transitionMatrix == 0)) == (nrow(data) * ncol(data))) {
    f_i_dot <-colSums(data)
    
    statistic <- 0
    
    for (i in 1:dim(object)) {
      for (j in 1:dim(object)) {
        if (data[i, j]>0&object[i, j]>0) statistic <- statistic + data[i, j]*log(data[i, j]/(f_i_dot[i]*object[i, j]))
      }
    }
    
    statistic <- statistic * 2
    
    null_elements <- sum(object@transitionMatrix == 0)
    
    dof <- dim(object) * (dim(object) - 1) - null_elements #r(r-1) - c, c null element ob objects
    
    p.value <- 1 - pchisq(q = statistic,df = dof)
    
    if (verbose == TRUE) {
      cat("Testing whether the\n");print(data);cat("transition matrix is compatible with\n");print(object@transitionMatrix);print("theoretical transition matrix")
      cat("ChiSq statistic is",statistic,"d.o.f are",dof,"corresponding p-value is",p.value,"\n")  
    }
    
    out <- list(statistic = statistic, dof = dof,pvalue = p.value)
  } else {
    statistic <- NA
    dof <- NA
    p.value <- 0
    
    if (verbose == TRUE) {
      cat("Testing whether the\n");print(data);cat("transition matrix is compatible with\n");print(object@transitionMatrix);print("theoretical transition matrix")
      cat("ChiSq statistic is",statistic,"d.o.f are",dof,"corresponding p-value is",p.value,"\n")
      cat("At least one transition is >0 in the data that is 0 in the object. Therefore the null hypothesis is rejected. \n") 
    }
    
    out <- list(statistic = statistic, dof = dof,pvalue = p.value)
  }

  #return output
  return(out)
}




.checkMatrix4Homogeneity<-function(matr) {
  out<-TRUE
  if (length(colnames(matr)) == 0) {message("Error! No colnames in input matrix"); out = FALSE}
  if (length(rownames(matr)) == 0) {message("Error! No rownames in input matrix"); out = FALSE}
  if (!all.equal(rownames(matr),colnames(matr))) {message("Error! Colnames <> Rownames")}
  if (any(matr<0)) {message("Error! Negative elements"); out = FALSE}
  return(out)
}

.addNamedColumns <- function(matr, fullnames) {
  if ( length( setdiff(names(matr),fullnames) )>0)  stop("Error! Names in matr not in fullnames")
  fullnames<-sort(fullnames)
  newMatr<-matrix(0,nrow = length(fullnames),ncol = length(fullnames),dimnames = list(fullnames,fullnames))
  
  current_support = colnames(matr)
  current_dim = dim(matr)
  
  for (i in 1:current_dim[1]) { #cycle on row
    for (j in 1:current_dim[2]) { #cycle on cols
      item<-matr[i,j] #take the element
      which_row_trans<-current_support[i] #define current row and cols
      which_col_trans<-current_support[j]
      # lookup element in the pooled table
      row_to_write <-match(x=which_row_trans,table = fullnames)
      col_to_write <-match(x=which_col_trans,table = fullnames)
      # write element into the pooled table
      newMatr[row_to_write,col_to_write] <- newMatr[row_to_write,col_to_write] + item
    }
  }

  return(newMatr)
}


#' @title Verify Homogeneity across transition matrices
#' 
#' @description Verifies that the s elements in the input list belongs to the same DTMC
#' 
#' @rdname statisticalTests
#' @family statisticalTests
#'
#' @param inputList A list of items that can coerced to transition matrices
#'
#' @return a list of transition matrices?
#' @export
#'
#' @examples
#' 
#' data(kullback)
#' verifyHomogeneity(inputList=kullback,verbose=TRUE)
#' 
verifyHomogeneity<-function(inputList, verbose = TRUE) {
  warning("The accuracy of the statistical inference functions has been questioned. It will be thoroughly investigated in future versions of the package.")  
  if (!is.list(inputList) ) stop("Error! inputList should be a string")
  if (length(inputList) < 2) stop("Error! inputList length lower than 2")
  
  #checks whether all inputs can be put as transition matrices
  
  for (i in 1:length(inputList)) {
    if (is.matrix(inputList[[i]]) == TRUE) {
      checks<-.checkMatrix4Homogeneity(inputList[[i]])
      if (!checks) stop("Error! Element ", i, " to be checked")
    } else {
      inputList[[i]]<-createSequenceMatrix(stringchar = inputList[[i]]) #convert all elements into transition matrices
    }
  }
  
  # create the pooled raw transition matrix and the matrix of rowsums
  all.names<-character()
  for (i in 1:length(inputList)) {
    all.names<-c(all.names, rownames(inputList[[i]]))
  }
  all.names<-sort(unique(all.names))
  ##initialize
  PooledRawTransitionMatrix <- matrix(0,nrow = length(all.names),ncol = length(all.names),dimnames = list(all.names, all.names))
  RowSumsMatrix <- matrix(0, nrow = length(inputList),ncol=length(all.names),dimnames = list(1:length(inputList),all.names))
  ##sum for each element in the list
  
  for (k in 1:length(inputList)) {
    current_support = rownames(inputList[[k]])
    current_dim = dim(inputList[[k]])
    
    for (i in 1:current_dim[1]) { #cycle on row
      for (j in 1:current_dim[2]) { #cycle on cols
        num_trans<-inputList[[k]][i, j] #take the element
        which_row_trans <- current_support[i] #define current row and cols
        which_col_trans <- current_support[j]
        # lookup element in the pooled table
        row_to_write <-match(x = which_row_trans,table = all.names)
        col_to_write <-match(x = which_col_trans,table = all.names)
        # write element into the pooled table
        PooledRawTransitionMatrix[row_to_write,col_to_write]=PooledRawTransitionMatrix[row_to_write,col_to_write]+num_trans
      }
    }
  }
  
  #create the matrix of rowsums fij.
  for (k in 1:length(inputList)) {
    my_row_sums <- rowSums(inputList[[k]])
    current_support = names(my_row_sums)
    for (i in 1:length(current_support)) {
      my_element<-my_row_sums[i]
      col_to_write<-match(x=current_support[i],table = all.names)
      RowSumsMatrix[k, col_to_write]<-RowSumsMatrix[k, col_to_write] + my_element
    }
  }
  
  # compute the chi - square statistic
  
  statistic <- 0
 # degreesOfFreedomLess <- 0
  newInputList <- lapply(inputList, .addNamedColumns,fullnames = all.names)
  number_of_transitions <- sapply(newInputList,sum)
  total_transitions <- sum(number_of_transitions)
  
  for (s in 1:length(inputList)) { #cycle across inputs
    for (j in 1:length(all.names)) { #cycle across rows
      for (k in 1:length(all.names)) { #cycle across cols
       if (any(newInputList[[s]][j,k] == 0, number_of_transitions[s] == 0, PooledRawTransitionMatrix[j,k] == 0)) {
         statistic <- statistic + 0 # zero element in log expr does not contribute to statistics 
#         degreesOfFreedomLess <- degreesOfFreedomLess +1
       } else {
         statistic <- statistic + newInputList[[s]][j, k] * log((total_transitions*newInputList[[s]][j, k])/(number_of_transitions[s]*PooledRawTransitionMatrix[j,k]))
       }
    }
    }
  }
  
  statistic <- statistic * 2
  #dof (s-1)*(r^2-1)-#zeros
  degrees_of_freedom <- (length(inputList) - 1)*(length(all.names)^2 - 1)#-degreesOfFreedomLess
  
  p.value <- 1 - pchisq(q = statistic,df = degrees_of_freedom)
  
  if (verbose == TRUE) {
    cat("Testing homogeneity of DTMC underlying input list \n")
    cat("ChiSq statistic is",statistic,"d.o.f are",degrees_of_freedom,"corresponding p-value is",p.value,"\n")  
  }
  
  #return output
  out <- list(statistic = statistic, dof = degrees_of_freedom,pvalue = p.value)
  return(out)
}
Any scripts or data that you put into this service are public.
markovchain documentation built on April 3, 2025, 9:59 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
markovchain
Easy Handling Discrete Time Markov Chains

R/statisticalTests.R
In markovchain: Easy Handling Discrete Time Markov Chains

Defines functions verifyHomogeneity .addNamedColumns .checkMatrix4Homogeneity verifyEmpiricalToTheoretical .seq2mat assessStationarity assessOrder verifyMarkovProperty .findNijPjk

Documented in assessOrder assessStationarity verifyEmpiricalToTheoretical verifyHomogeneity verifyMarkovProperty

Try the markovchain package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

markovchain Easy Handling Discrete Time Markov Chains

R/statisticalTests.R In markovchain: Easy Handling Discrete Time Markov Chains

Defines functions verifyHomogeneity .addNamedColumns .checkMatrix4Homogeneity verifyEmpiricalToTheoretical .seq2mat assessStationarity assessOrder verifyMarkovProperty .findNijPjk

Documented in assessOrder assessStationarity verifyEmpiricalToTheoretical verifyHomogeneity verifyMarkovProperty

Try the markovchain package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

markovchain
Easy Handling Discrete Time Markov Chains

R/statisticalTests.R
In markovchain: Easy Handling Discrete Time Markov Chains