R/ChangePoint.R

preselect.poisson.calc = function(data, pre) {
  if (pre >= length(data) | pre <= 0) {
    stop("Changepoint must be located between endpoints of variable range.")
  }
  singledim=function(data, pre){
    n=length(data)
    y=c(0,cumsum(data))
    if(y[n+1]==0){
      null=Inf
    }
    else{
      null=2*y[n+1]*log(n) - 2*y[n+1]*log(y[n+1])
    }
    taustar=2:(n-2)
    tmp=2*log(taustar)*y[taustar+1] -2*y[taustar+1]*log(y[taustar+1]) + 2*log(n-taustar)*(y[n+1]-y[taustar+1])-2*(y[n+1]-y[taustar+1])*log((y[n+1]-y[taustar+1]))
    if(sum(is.na(tmp))!=0){
      tmp[which(is.na(tmp))]=Inf
    }
    tau=pre
    taulike=tmp[pre-2+1]
    out=c(tau,null,taulike)
    names(out)=c('cpt','null','alt')
    return(out)
  }
  if(is.null(dim(data))==TRUE){
    cpt=singledim(data,pre)
    return(cpt)
  }
  else{
    rep=nrow(data)
    n=ncol(data)
    cpt=matrix(0,ncol=3,nrow=rep)
    for(i in 1:rep){
      cpt[i,]=singledim(data[i,],pre)
    }
    colnames(cpt)=c('cpt','null','alt')
    return(cpt)
  }
}

preselect.norm.mean.calc = function(data,pre){
  if (pre >= length(data) | pre <= 0) {
    stop("Changepoint must be located between endpoints of variable range.")
  }
  singledim=function(data,pre){
    n=length(data)
    y=c(0,cumsum(data))
    y2=c(0,cumsum(data^2))
    null=y2[n+1]-y[n+1]^2/n
    taustar=2:(n-2+1)
    tmp=y2[taustar+1]-y[taustar+1]^2/taustar + (y2[n+1]-y2[taustar+1]) - ((y[n+1]-y[taustar+1])^2)/(n-taustar)
    tau=pre
    taulike=tmp[pre-2+1]
    out=c(tau,null,taulike)
    names(out)=c('cpt','null','alt')
    return(out)
  }
  if(is.null(dim(data))==TRUE){
    cpt=singledim(data,pre)
    return(cpt)
  } else{
    rep=nrow(data)
    n=ncol(data)
    cpt=matrix(0,ncol=3,nrow=rep)
    for(i in 1:rep){
      cpt[i,]=singledim(data[i,],pre)
    }
    colnames(cpt)=c('cpt','null','alt')
    return(cpt)
  }
}

preselect.norm.var.calc = function(data,pre,know.mean,mu){
  if (pre >= length(data) | pre <= 0) {
    stop("Changepoint must be located between endpoints of variable range.")
  }
  if(is.null(dim(data))==TRUE){
    if((know.mean==FALSE)&(is.na(mu))){
      mu=mean(coredata(data))
    }
  } else {
    rep=nrow(data)
    if(length(mu)!=rep){
      mu=rep(mu,rep)
    }
    for(i in 1:rep){
      if((know.mean==FALSE)&(is.na(mu[i]))){
        mu=mean(coredata(data[i,]))
      }
    }
  }
  n=length(data)
  y=c(0,cumsum((data-mu)^2))
  null=n*log(y[n+1]/n)
  taustar=2:(n-2+1)
  sigma1=y[taustar+1]/taustar
  neg=sigma1<=0
  sigma1[neg==TRUE]=1*10^(-10)
  sigman=(y[n+1]-y[taustar+1])/(n-taustar)
  neg=sigman<=0
  sigman[neg==TRUE]=1*10^(-10)
  tmp=taustar*log(sigma1) + (n-taustar)*log(sigman)
  tau=pre
  taulike=tmp[pre-2+1]
  out=c(tau,null,taulike)
  names(out)=c('cpt','null','alt')
  return(out)
}

preselect.norm.meanvar.calc = function(data,pre){
  singledim=function(data,pre){
    n=length(data)
    y=c(0,cumsum(data))
    y2=c(0,cumsum((data)^2))
    null=n*log((y2[n+1]-(y[n+1]^2/n))/n)
    taustar=2:(n-2+1)
    sigma1=((y2[taustar+1]-(y[taustar+1]^2/taustar))/taustar)
    neg=sigma1<=0
    sigma1[neg==TRUE]=1*10^(-10)
    sigman=((y2[n+1]-y2[taustar+1])-((y[n+1]-y[taustar+1])^2/(n-taustar)))/(n-taustar)
    neg=sigman<=0
    sigman[neg==TRUE]=1*10^(-10)
    tmp=taustar*log(sigma1) + (n-taustar)*log(sigman)
    tau=pre
    taulike=tmp[pre-2+1]
    out=c(tau,null,taulike)
    names(out)=c('cpt','null','alt')
    return(out)
  }
  if(is.null(dim(data))==TRUE){
    cpt=singledim(data,pre)
    return(cpt)
  }
  else{
    rep=nrow(data)
    n=ncol(data)
    cpt=matrix(0,ncol=3,nrow=rep)
    for(i in 1:rep){
      cpt[i,]=singledim(data[i,],pre)
    }
    colnames(cpt)=c('cpt','null','alt')
    return(cpt)
  }
}

p.value <- function(v, t, cpt) {
  N = length(v)
  n = sum(v)
  num = (n/N)^n * ((N-n)/N)^(N-n)
  chex = min(which(t==cpt))
  N_1 = chex-1
  N_2 = N - chex
  n_1 = sum(v[1:N_1])
  n_2 = sum(v[(N-N_2):N])
  den = (n_1/N_1)^n_1 * ((N_1-n_1)/N_1)^(N_1-n_1) * (n_2/N_2)^n_2 * ((N_2-n_2)/N_2)^(N_2-n_2)
  lambda = num / den
  ts = -2*log(lambda)
  p = 1 - pchisq(ts, df=2)
  return(p)
}

#' Changepoint Model with Normal Distribution
#'
#' The changepoint model is used to test if there is a significant change in
#' mean and/or variance of a variable (assumed to have normal distribution),
#' over the course of some threshold, typically time or sequential order.
#'
#' @param v A vector that contains response variable of interest.
#' @param pre Numerical value that represents preselected changepoint for any
#'   model, should the user wish to test a specific point in data; NA by
#'   default.
#' @param type Choice between "Mean", "Var", and "MeanVar", regarding the
#'   parameter(s) for which you wish to test significant change in value;
#'   "MeanVar" by default.
#' @param know.mean (Valid only when \code{type="Var"}) Boolean variable, i.e.
#'   TRUE or FALSE, indicating whether or not you wish to set a value for "true
#'   mean" of data; FALSE by default.
#' @param mu (Valid only when \code{type="Var"} and \code{know.mean=TRUE})
#'   Numerical value that represents the "true mean" of data; NA by default.
#'
#' @return List of the following: \itemize{ \item{Table containing location(s)
#'   of changepoint(s) -- either preselected or determined rigorously -- ranked
#'   by significance and associated significance value(s)} \item{Numerical value
#'   representing the MBIC threshold by which to base significance of a
#'   changepoint, i.e. penalty values LARGER than MBIC threshold indicate
#'   significance.} }
#'
#' @examples
#' a = rnorm(100,0,25/100)
#' b = rep(0,100)
#' for (i in 1:100) {
#'   b[i] = rnorm(1,0,(i/2+25)/100)
#' }
#' c = rnorm(100,0,75/100)
#' d = rep(0,100)
#' for (i in 1:100) {
#'   d[i] = rnorm(1,0,(75-i/2)/100)
#' }
#' e = rnorm(100,0,25/100)
#' v = c(a,b,c,d,e)
#'
#' change.normal(v, type="Var", know.mean=TRUE, mu=0)
#' change.normal(v, type="Var", know.mean=TRUE, mu=0, pre=200)
#'
#' @export
change.normal = function(v, pre=NA, type="MeanVar", know.mean=FALSE, mu=NA) {
  if(is.null(dim(v))==TRUE) {
    n=length(v)
  } else{
    n=ncol(v)
  }
  if(n<4){stop('Data must have at least 4 observations to fit a changepoint model.')}
  if(n<(2*2)){stop('Minimum segment length is too large to include a change in this data')}
  if (is.na(pre) == FALSE) {
    if (type=="Mean") {
      cpt.pen = penalty_decision(penalty="MBIC", pen.value=0, n=n, diffparam=1, asymcheck="mean.norm", method="AMOC")
      if(is.null(dim(v))==TRUE) {
        tmp=preselect.norm.mean.calc(coredata(v),pre)
        tmp[3]=tmp[3]+log(tmp[1])+log(n-tmp[1]+1)
        alogn=(2*log(log(n)))^(-(1/2))
        blogn=(alogn^(-1))+(1/2)*alogn*log(log(log(n)))  # Chen & Gupta (2000) pg10
        p.value=1-exp(-2*(pi^(1/2))*exp(-alogn*sqrt(abs(tmp[2]-tmp[3]))+(alogn^{-1})*blogn))-exp(-2*(pi^(1/2))*exp((alogn^{-1})*blogn))
        my_row = list(c(tmp[1], tmp[2] - tmp[3], p.value))
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      } else{
        tmp=preselect.norm.mean.calc(v,pre)
        tmp[,3]=tmp[,3]+log(tmp[,1])+log(n-tmp[,1]+1)
        alogn=(2*log(log(n)))^(-(1/2))
        blogn=(alogn^(-1))+(1/2)*alogn*log(log(log(n)))  # Chen & Gupta (2000) pg10
        p.value = 1-exp(-2*(pi^(1/2))*exp(-alogn*sqrt(abs(tmp[,2]-tmp[,3]))+(alogn^{-1})*blogn))-exp(-2*(pi^(1/2))*exp((alogn^{-1})*blogn))
        rep=nrow(v)
        my_row=list()
        for(i in 1:rep){
          my_row[[i]] = c(tmp[i,1], tmp[i,2] - tmp[i,3], p.value[i])
        }
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      }
    } else if (type=="Var") {
      mu=mu[1]
      cpt.pen = penalty_decision(penalty="MBIC", pen.value=0, n=n, diffparam=1, asymcheck="var.norm", method="AMOC")
      if(is.null(dim(v))==TRUE){
        if((know.mean==FALSE)&(is.na(mu))){
          mu=mean(coredata(v))
        }
        tmp=preselect.norm.var.calc(coredata(v),pre,know.mean,mu)
        tmp[3]=tmp[3]+log(tmp[1])+log(n-tmp[1]+1)
        alogn=sqrt(2*log(log(n)))
        blogn=2*log(log(n))+ (log(log(log(n))))/2 - log(gamma(1/2))
        p.value=1-exp(-2*exp(-alogn*sqrt(abs(tmp[2]-tmp[3]))+blogn))-exp(-2*exp(blogn))  # Chen & Gupta (2000) pg27
        my_row = list(c(tmp[1], tmp[2] - tmp[3], p.value))
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      } else{
        rep=nrow(v)
        tmp=matrix(0,ncol=3,nrow=rep)
        if(length(mu)!=rep){
          mu=rep(mu,rep)
        }
        for(i in 1:rep){
          if((know.mean==FALSE)&(is.na(mu[i]))){
            mu=mean(coredata(v[i,]))
          }
          tmp[i,]=preselect.norm.var.calc(v[i,],pre,know.mean,mu[i])
        }
        tmp[,3]=tmp[,3]+log(tmp[,1])+log(n-tmp[,1]+1)
        alogn=sqrt(2*log(log(n)))
        blogn=2*log(log(n))+ (log(log(log(n))))/2 - log(gamma(1/2))
        p.value=1-exp(-2*exp(-alogn*sqrt(abs(tmp[,2]-tmp[,3]))+blogn))-exp(-2*exp(blogn))
        my_row=list()
        for(i in 1:rep){
          my_row[[i]] = c(tmp[i,1], tmp[i,2] - tmp[i,3], p.value[i])
        }
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      }
    } else if (type=="MeanVar") {
      cpt.pen = penalty_decision(penalty="MBIC", pen.value=0, n=n, diffparam=2, asymcheck="meanvar.norm", method="AMOC")
      if(is.null(dim(v))==TRUE){
        tmp=preselect.norm.meanvar.calc(coredata(v),pre)
        tmp[3]=tmp[3]+log(tmp[1])+log(n-tmp[1]+1)
        alogn=sqrt(2*log(log(n)))
        blogn=2*log(log(n))+ (log(log(log(n))))
        p.value=1-exp(-2*exp(-alogn*sqrt(abs(tmp[2]-tmp[3]))+blogn))-exp(-2*exp(blogn))
        my_row = list(c(tmp[1], tmp[2] - tmp[3], p.value))
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      } else{
        tmp=preselect.norm.meanvar.calc(v,pre)
        tmp[,3]=tmp[,3]+log(tmp[,1])+log(n-tmp[,1]+1)
        alogn=sqrt(2*log(log(n)))
        blogn=2*log(log(n))+ (log(log(log(n))))
        p.value=1-exp(-2*exp(-alogn*sqrt(abs(tmp[,2]-tmp[,3]))+blogn))-exp(-2*exp(blogn))
        rep=nrow(v)
        my_row=list()
        for(i in 1:rep){
          my_row[[i]] = c(tmp[i,1], tmp[i,2] - tmp[i,3], p.value[i])
        }
        my_df = as.data.frame(do.call(rbind,my_row))
        colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value", "Changepoint p-Value")
        return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
      }
    } else {
      stop("Incorrect change type. Please select from 'Mean', 'Var', or 'MeanVar'.")
    }
  } else {
    if (type=="Mean") {
      cpt = cpt.mean(v, test.stat="Normal", method="BinSeg", Q = n / 2 + 1, minseglen=2)
    } else if (type=="Var") {
      cpt = cpt.var(v, test.stat="Normal", method="BinSeg", know.mean = know.mean, mu=mu, Q = n / 2 + 1, minseglen=2)
    } else if (type=="MeanVar") {
      cpt = cpt.meanvar(v, test.stat="Normal", method="BinSeg", Q = n / 2 + 1, minseglen=2)
    } else {
      stop("Incorrect change type. Please select from 'Mean', 'Var', or 'MeanVar'.")
    }
    my_df = list()
    j = 1
    for (i in 1:(n/2 + 1)) {
      if (cpt@pen.value.full[i] >= cpt@pen.value) {
        my_row = c(cpt@cpts.full[i,i], cpt@pen.value.full[i])
        my_df[[j]] = my_row
        j = j + 1
      }
    }
    if (length(my_df) != 0) {
      my_df = as.data.frame(do.call(rbind,my_df))
      colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value")
      return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt@pen.value))
    } else {
      return("No significant changepoints were found. If you wish to see the significance of a particular changepoint, please use the 'pre' parameter.")
    }
  }
}

#' Changepoint Model with Binary Discrete Distribution
#'
#' The changepoint model is used to test if there is a significant change in
#' binary probability of one outcome versus another, over the course of some
#' threshold, typically time or sequential order.
#'
#' @param v Vector that contains response variable of interest.
#' @param pre Numerical value that represents preselected changepoint (only one
#'   is allowed) for any model, should the user wish to test a specific point in
#'   data; NA by default.
#' @param t Vector that contains threshold variable over which to study change
#'   (ex. Time, outcome count in succession); default is sequence from 1 to
#'   total number of observations.
#' @param x Vector that contains hidden parameter, should you wish to test an
#'   interaction of original data with another variable, i.e. a hierarchical
#'   model; NA by default.
#'
#' @return Table containing location of changepoint -- either preselected or
#'   determined rigorously -- and associated significance value.
#'
#' @examples
#' r = rnorm(400,0,1) * 10
#' r1 = r[1:200]
#' r2 = r[201:400]
#' a = rbinom(200,1,
#'            exp(r1)/(1+exp(r1)))
#' b = rbinom(200,1,
#'            exp(r2+1)/(1+exp(r2+2)))
#' d = data.frame(x=r,y=c(a,b),t=1:400)
#'
#' change.binary(v = d$y, t= d$t, x = d$x)
#'
#' a = rbinom(200,1,0.3)
#' b = rbinom(200,1,0.7)
#' d = data.frame(t=1:400,y=c(a,b))
#'
#' change.binary(d$y, d$t)
#' change.binary(d$y, d$t, pre = 195)
#'
#' @export
change.binary = function(v, t = seq(1, length(v)), x = rep(NA, length(v)), pre=NA) {
  data = data.frame(v = v, t = t, x = x)
  if (is.na(pre)==TRUE) {
    if (all(is.na(x)) == TRUE) {
      cpt1 = chngptm(formula.1 = v ~ 1, formula.2=~t, data, family="binomial", type="step")
    } else {
      cpt1 = chngptm(formula.1 = v ~ x, formula.2=~t, data, family="binomial", type="step")
    }
    p.value = p.value(v, t, cpt1$coefficients[["chngpt"]])
    my_row = list(c(cpt1$coefficients[["chngpt"]], p.value))
    my_df = as.data.frame(do.call(rbind,my_row))
    colnames(my_df) = c("Changepoint Location", "Changepoint p-Value")
    return(my_df)
  } else {
    if (any(is.na(x)) == FALSE) {
      stop("Unfortunately, the preselected changepoint method does not work with a hidden variable x for the Binary statistic.")
    }
    p.value = p.value(v, t, pre)
    my_row = list(c(pre, p.value))
    my_df = as.data.frame(do.call(rbind,my_row))
    colnames(my_df) = c("Changepoint Location", "Changepoint p-Value")
    return(my_df)
  }
}

#' Changepoint Model with Poisson Distribution
#'
#' The changepoint model is used to test if there is a significant change in
#' distribution of a variable (assumed to have Poisson distribution), over the
#' course of some threshold, typically time or sequential order.
#'
#' @param v Vector that contains response variable of interest.
#' @param pre Numerical value that represents preselected changepoint for any
#'   model, should the user wish to test a specific point in data, NA by
#'   default.
#'
#' @return List of the following: \itemize{ \item{Table containing location(s)
#'   of changepoint(s) -- either preselected or determined rigorously -- ranked by
#'   significance and associated significance value(s)}
#'   \item{Numerical value representing the MBIC threshold by which to base
#'   significance of a changepoint, i.e. penalty values LARGER than MBIC
#'   threshold indicate significance.} }
#'
#' @examples #Examples you wish to use, in demonstration of function
#' a = rpois(200,1)
#' b = rpois(200,2)
#' c = rpois(200,3)
#' v = c(a,b,c)
#'
#' change.poisson(v)
#' change.poisson(v, pre=400)
#'
#' @export
change.poisson = function(v, pre=NA) {
  if((sum(v<0)>0)){stop('Poisson test statistic requires positive data')}
  if(sum(as.integer(v)==v)!=length(v)){stop('Poisson test statistic requires integer data')}
  if(is.null(dim(v))==TRUE){
    n=length(v)
  }
  else{
    n=ncol(v)
  }
  if(n<4){stop('Data must have at least 4 observations to fit a changepoint model.')}
  if(n<(2*2)){stop('Minimum segment length is too large to include a change in this data')}
  if (is.na(pre) == FALSE) {
    cpt.pen = penalty_decision(penalty="MBIC", pen.value=0, n=n, diffparam=1, asymcheck="meanvar.poisson", method="AMOC")
    if(is.null(dim(v))==TRUE){
      tmp=preselect.poisson.calc(coredata(v),pre)
      tmp[3]=tmp[3]+log(tmp[1])+log(n-tmp[1]+1)
      my_row = list(c(tmp[1], tmp[2] - tmp[3]))
      my_df = as.data.frame(do.call(rbind,my_row))
      colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value")
      return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
    }
    else{
      tmp=preselect.poisson.calc(v,pre)
      tmp[,3]=tmp[,3]+log(tmp[,1])+log(n-tmp[,1]+1)
      rep=nrow(v)
      my_row=list()
      for(i in 1:rep){
        my_row[[i]]== c(tmp[i,1], tmp[i,2] - tmp[i,3])
      }
      my_df = as.data.frame(do.call(rbind,my_row))
      colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value")
      return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt.pen))
    }
  } else {
    cpt = cpt.meanvar(v, test.stat="Poisson", method="BinSeg", minseglen=2, Q= n / 2 + 1)
    my_df = list()
    j = 1
    for (i in 1:(n/2 + 1)) {
      if (cpt@pen.value.full[i] >= cpt@pen.value) {
        my_row = c(cpt@cpts.full[i,i], cpt@pen.value.full[i])
        my_df[[j]] = my_row
        j = j + 1
      }
    }
    if (length(my_df) != 0) {
      my_df = as.data.frame(do.call(rbind,my_df))
      colnames(my_df) = c("Changepoint Location", "Changepoint Penalty Value")
      return(list(changepoint_data_frame = my_df, MBIC_penalty_threshold = cpt@pen.value))
    } else {
      return("No significant changepoints were found. If you wish to see the significance of a particular changepoint, please use the 'pre' parameter.")
    }
  }
}
hollicam/DiscreteChoiceModels documentation built on May 3, 2019, 8:59 p.m.