# R/iid.test.R
#
# Defines functions: test.records, records, n.records, test.iid.test,
# iid.test.default, iid.test.field, iid.test.station, iid.test
#
# Documented in: iid.test, iid.test.default, iid.test.field,
# iid.test.station, n.records, records, test.records

#' iid test & n.records
#' 
#' Test for whether a variable is independent and identically distributed
#' (iid). 
#' 
#' Reference:
#' 
#' Benestad, R.E., 2003: How often can we expect a record-event? Climate
#' Research. 23, 3-13 (pdf)
#' 
#' Benestad, R.E., 2004: Record values, nonstationarity tests and extreme value
#' distributions, Global and Planetary Change, vol 44, p. 11-26
#' 
#' The papers are available in the pdf format from
#' \url{http://regclim.met.no/results_iii_artref.html}.
#' 
#' Note, gaps of missing data (NA) can bias the results and produce an
#' under-count. The sign of non-iid behaviour is when the 'forward' analysis
#' indicated higher number of record-events than the confidence region and the
#' backward analysis gives lower than the confidence region.
#' 
#' Version 0.7: Added a test checking for dependencies based on an expected
#' number from a binomial distribution and given the probability p1(n) = 1/n.
#' This test is applied to the parallel series for one respective time
#' (realisation), and is then repeated for all observation times. The check
#' uses \code{\link{qbinom}} to compute a theoretical 95\% confidence interval,
#' and a number outside this range is marked with red in the 'ball diagram'
#' (first plot). \code{\link{pbinom}} is used to estimate the p-value for the
#' observed number of simultaneous record-events.
#' 
#' @aliases iid.test iid.test.station iid.test.field iid.test.default n.records records test.records
#' @param x A data matrix or a vector.
#' @param plot Flag: plot the diagnostics.
#' @param Monte.Carlo Flag: for estimating confidence limits.
#' @param N.test Number of Monte-Carlo runs.
#' @param rev.plot.rev TRUE: plots the backward analysis from right to left,
#' otherwise left to right.
#' @param \dots additional arguments
#'
#' @return list: 'record.density' and 'record.density.rev' for the reverse
#' analysis. The variables CI.95, p.val, and i.cluster (and their reverse
#' equivalents '.rev') return the estimated 95\% conf. int, p-value, and the
#' location of the clusters (binomial).
#' 
#' @keywords manip
#' @examples
#' 
#'   # takes a long time to run
#'   dat <- rnorm(100*30)
#'   dim(dat) <- c(100,30)
#'   iid.test(dat)
#' 
#'   print(n.records(rnorm(1000))$N)
#'   print(n.records(rnorm(1000))$N.rev)
#' @export iid.test
iid.test <- function(x,...) UseMethod("iid.test")

#' @exportS3Method
#' @export iid.test.station
iid.test.station <- function(x,...,verbose=TRUE) {
  ## Station method: collapse the station record to one value per calendar
  ## month (the monthly maximum via as.monthly), which re-orders the data
  ## into parallel series for each month, and then run the default iid test
  ## on the resulting matrix.
  if (verbose) print('iid.test.station')
  X <- as.monthly(x, FUN='max')
  result <- iid.test.default(X, verbose=verbose)
  invisible(result)
}

#' @exportS3Method
#' @export iid.test.field
iid.test.field <- function(x,verbose=TRUE,...) {
  # Uses EOFs to account for spatial co-variance, and test the PCs rather
  # than the grid points.
  # Re-orders the PCs into parallel time series for each calendar
  # month into new matrix X. Then apply the iid.test to this matrix. 
  # NOTE(review): the code assumes EOF() returns exactly 20 PCs per month
  # (hard-coded '20' below) -- confirm against the EOF() default.
  
  if (verbose) print('iid.test.field')
  # n = number of distinct years in the record
  yrs <- year(x); n <- length(rownames(table(yrs)))
  # X holds n years x (20 PCs for each of the 12 months), filled with NA
  X <- matrix(rep(NA,20*12*n),n,12*20)
  for (i in 1:12) {
    eof <- EOF(subset(x,it=month.abb[i]))
    # The PCs and EOFs have a random sign. ensure that these are
    # consistent with the original data:
    # KMP 2019-05-28: replaced spatial.avg.field with aggregate.area
    #y <- spatial.avg.field(subset(x,it=month.abb[i]))
    y <- aggregate.area(subset(x,it=month.abb[i]), FUN="mean")
    #print(length(y)); dim(eof)
    # Project the area mean onto the PCs; a negative inner product means
    # the PC is anti-correlated with the data, so flip its sign.
    r <- y %*% coredata(eof)
    #print(paste('inner-product: ',round(r,2)))
    eof[,r < 0] <- -eof[,r < 0]
    # m = number of years available for this month (may be < n)
    m <- length(index(eof))
    #str(eof)
    #print(c(n,m,(1 + n-m):n,1:20 + (i-1)*20));
    #print(dim(X[(1 + n-m):n,1:20 + (i-1)*20]))
    #print(dim(coredata(eof)))
    # Right-align the m available years within the n-year window; columns
    # (i-1)*20+1 .. i*20 hold the PCs of month i.
    X[(1 + n-m):n,1:20 + (i-1)*20] <- coredata(eof)
  }
  # Keep only years with complete data across all months and PCs.
  good <- is.finite(rowMeans(X))
  iid <- iid.test.default(X[good,],verbose=verbose)
  invisible(iid)
}

#' @exportS3Method
#' @export iid.test.default
iid.test.default <- function(x,...,plot=TRUE,Monte.Carlo=TRUE,
                             N.test=200,rev.plot.rev=TRUE,verbose=TRUE) {
  ## Core iid test: counts record-events in each column of x, both forward
  ## and backward in time, compares the observed record densities with the
  ## theoretical expectation 1/t for iid data, and flags suspicious
  ## clusters of simultaneous record-events with a binomial model.
  ## Returns (invisibly) a list with the record densities, binomial 95%
  ## confidence bands, p-values and cluster flags (plus '.rev' versions
  ## for the backward analysis).
  if (verbose) print('iid.test.default')
  Y <- as.matrix(x)
  Y[!is.finite(Y)] <- NA
  t.r <- dim(Y)  # t.r[1] = number of time steps, t.r[2] = number of series
  # TRUE where a record-event occurs (forward / reverse chronology)
  events <- matrix(rep(FALSE,t.r[1]*t.r[2]),t.r[1],t.r[2])
  events.rev <- events
  N.records <- rep(NA,t.r[2])
  # Use binomial distribution to look for suspicious clusters (dependencies)
  CI.95 <- rep(t.r[2],t.r[1]*2); dim(CI.95) <- c(t.r[1],2); CI.95.rev <- CI.95 
  p.val <- rep(NA,t.r[1]); i.cluster <- rep(FALSE,t.r[1])
  p.val.rev <- p.val; i.cluster.rev <- i.cluster
  
  if (plot) {
    # Set up the 'ball diagram': forward analysis in the lower half,
    # backward analysis in the upper half.
    par(col.axis="white")
    plot(c(1,t.r[1]),c(1,2*t.r[2]),type="n",main="iid-test",
         xlab="record length",ylab="location")
    par(col.axis="black")
    axis(1)
    lines(c(-5,t.r[1]+6),rep(t.r[2],2)+0.5,lwd=3)
    par.0 <- par(); par(srt=90)
    text(0,round(t.r[2]/2),"Forward",cex=1,vfont=c("sans serif","italic"))
    text(0,round(3*t.r[2]/2),"Backward",cex=1,vfont=c("sans serif","italic"))
    par(par.0)
  }
  
  for (ir in 1:t.r[2]) {
    # Record-event timing for series ir, forward and reversed.
    #record.stats <- n.records(subset(x,is=ir))
    record.stats <- n.records(zoo(Y[,ir], order.by=index(Y)))
    if (verbose) str(record.stats)
    N.records[ir] <- record.stats$N
    events[record.stats$t,ir] <- TRUE
    events.rev[record.stats$t.rev,ir] <- TRUE
    if (plot) {
      # Timing index for record.
      t1 <- record.stats$t
      if (rev.plot.rev) {
        t2 <- record.stats$t.rev  
      } else {
        t2 <- t.r[1] - record.stats$t.rev + 1
      }
      # Slightly offset grey dots give the 'ball' appearance.
      points(t1,rep(ir,record.stats$N)+0.025,pch=20,cex=1.50,col="grey30")
      points(t1+0.05,rep(ir,record.stats$N)+0.050,pch=20,cex=0.70,col="grey50")
      points(t1+0.07,rep(ir,record.stats$N)+0.075,pch=20,cex=0.50,col="grey70")
      points(t1+0.1,rep(ir,record.stats$N)+0.100,pch=20,cex=0.30,col="white")   
      
      points(t2,rep(ir,record.stats$N.rev)+0.025+t.r[2],pch=20,
             cex=1.50,col="grey30")
      points(t2+0.05,rep(ir,record.stats$N.rev)+0.050+t.r[2],pch=20,
             cex=0.70,col="grey50")
      points(t2+0.07,rep(ir,record.stats$N.rev)+0.075+t.r[2],pch=20,
             cex=0.50,col="grey70")
      points(t2+0.1,rep(ir,record.stats$N.rev)+0.100+t.r[2],pch=20,
             cex=0.30,col="white")
      if(!is.null(loc(x)[ir])) text(loc(x)[ir],0,ir,pos=3)
    }
  }
  
  # Binomial cluster statistics: computed independently of 'plot' so the
  # returned CI.95/p.val/i.cluster are always filled in (previously they
  # were only computed when plotting).
  for (it in 2:t.r[1]) {
    # Number of simultaneous record-events at time it (forward).
    # Fix: the original passed the whole logical row events[it,] to
    # pbinom, which yields a vector and fails for more than one series.
    n.simult <- sum(events[it,])
    CI.95[it,] <- qbinom(p=c(0.025,0.975),size=t.r[2],prob=1/it)
    p.val[it] <- pbinom(n.simult,size=t.r[2],prob=1/it)
    if ( (n.simult < CI.95[it,1]) | (n.simult > CI.95[it,2]) ) {
      i.cluster[it] <- TRUE
      if (plot) lines(rep(it,2),c(0,t.r[2]),lwd=1,lty=2,col=rgb(1,0.5,0.5,0.3))
    }
    # Backward analysis: use events.rev (the original mistakenly re-used
    # the forward 'events' for p.val.rev).
    n.simult.rev <- sum(events.rev[t.r[1]-it+1,])
    CI.95.rev[it,] <- qbinom(p=c(0.025,0.975),size=t.r[2],prob=1/it)
    p.val.rev[it] <- pbinom(n.simult.rev,size=t.r[2],prob=1/it)
    if ( (n.simult.rev < CI.95.rev[it,1]) |
         (n.simult.rev > CI.95.rev[it,2]) ) {
      i.cluster.rev[it] <- TRUE
      if (plot) {
        if (rev.plot.rev)
          lines(rep(t.r[1]-it+1,2),c(t.r[2]+1,2*t.r[2]),lwd=1,
                lty=2,col=rgb(1,0.5,0.5,0.3)) else
                  lines(rep(it,2),c(t.r[2]+1,2*t.r[2]),lwd=1,lty=2,col=rgb(1,0.5,0.5,0.3))
      }
    }    
  }
  
  # Mask gaps so the densities average only over series with valid data.
  events[!is.finite(Y)] <- NA
  events.rev[!is.finite(Y)] <- NA
  record.density <- rowMeans(events,na.rm=TRUE)
  record.density.rev <- rev(rowMeans(events.rev,na.rm=TRUE))
  N <- length(record.density)
  
  q025 <- rep(NA,N)
  q975 <- q025    
  if (Monte.Carlo) {
    # Empirical confidence band for the cumulative record count under the
    # iid null hypothesis, from N.test synthetic (rnorm) data sets.
    print(paste("Please be patient -",N.test,"Monte Carlo runs in progress..."))
    record.mc <- rep(NA,2*N.test*N)
    dim(record.mc) <- c(N,N.test,2)
    for (ii in 1:N.test) {
      mc.stats <- test.iid.test(d=dim(events),plot=FALSE,Monte.Carlo=FALSE)
      record.mc[,ii,1] <- cumsum(mc.stats$record.density)
      record.mc[,ii,2] <- cumsum(mc.stats$record.density.rev)
    } 
    
    for (i in 1:N) {
      q025[i] <- quantile(record.mc[i,,],0.025)
      # Upper bound of the 95% band (was mistyped as 0.955 in the original)
      q975[i] <- quantile(record.mc[i,,],0.975)
    }
    sub <- paste("Shaded region= 95% conf.int. from Monte-Carlo with N=",N.test)
  } else {
    sub <- ""
  }
  
  if (plot) {
    # Second figure: observed vs expected cumulative number of records
    # (the theoretical expectation is sum(1/t), shown on an exp scale).
    dev.new(); par(col.axis="white")
    Time <- 1:N
    plot(Time,exp( cumsum( 1/(1:N)) ),type="l",lwd=3,col="grey60",
         xlab="Time",ylab="exp( cumsum( 1/(1:n)) )",
         main="Observed & Expected number of record-events",
         sub=sub)
    par(col.axis="black")
    axis(1)
    axis(2,at=exp(1:(2*sum(record.density,na.rm=TRUE))),
         labels=1:(2*sum(record.density,na.rm=TRUE)))
    legend(1,exp(sum(1/(1:N))),c("Theoretical","Forward","Backward"),
           pch=c(-1,19,21),lwd=c(3,1,1),lty=c(1,0,0),col=c("grey60",
                                                           rep("black",2)))
    
    if (Monte.Carlo) {
      polygon(c(Time,rev(Time)),c(exp(q025),rev(exp(q975))),
              col="grey90",border="grey85")
      lines(Time,exp( cumsum( 1/(1:N)) ),lwd=3,col="grey60")
    }
    grid()
    points(Time,exp(cumsum(record.density)),pch=19,cex=0.9)
    points(Time,exp(cumsum(record.density.rev)),pch=21,cex=0.9)
    
    # Third figure: record densities with binomial confidence whiskers.
    dev.new()
    plot(Time,1/(1:N),type="n",xlab="Time",
         main="Observed & Expected record-occurence",
         sub="95% Confidence interval from binomial distribution",
         ylab="record-density")
    for (i in 1:N) {
      lines(rep(Time[i],2)-0.2,c(CI.95[i,1]/t.r[2],CI.95[i,2]/t.r[2]),lwd=2,
            col="grey40")
      lines(Time[i]+c(-0.2,0.2),rep(CI.95[i,1]/t.r[2],2),lwd=1,col="grey40")
      lines(Time[i]+c(-0.2,0.2),rep(CI.95[i,2]/t.r[2],2),lwd=1,col="grey40")
      lines(rep(Time[i],2)+0.2,c(CI.95.rev[i,1]/t.r[2],CI.95.rev[i,2]/t.r[2]),
            lwd=2,col="grey70")
      lines(Time[i]+c(-0.2,0.2),rep(CI.95.rev[i,1]/t.r[2],2),lwd=1,col="grey70")
      lines(Time[i]+c(-0.2,0.2),rep(CI.95.rev[i,2]/t.r[2],2),lwd=1,col="grey70")
    }
    points(Time,record.density,pch=20,cex=0.9,col="grey20")    
    points(Time,record.density.rev,pch=21,cex=0.9)
    lines(Time,1/(1:N),lwd=2,col="red")
  }
  
  results <- list(record.density=record.density,
                  record.density.rev=record.density.rev,
                  CI.95=CI.95,p.val=p.val,i.cluster=i.cluster,
                  CI.95.rev=CI.95.rev,p.val.rev=p.val.rev,
                  i.cluster.rev=i.cluster.rev)
  invisible(results)
}


# internal function - no need to export
test.iid.test <- function(distr="rnorm",d=c(70,50),plot=TRUE,
                          Monte.Carlo=TRUE) {
  ## Generate a d[1] x d[2] matrix of iid random numbers drawn from the
  ## named distribution and run iid.test on it.  Used both as a self-test
  ## and to build the Monte-Carlo null distribution in iid.test.default.
  ## match.fun replaces the original eval(parse(text=...)) construction:
  ## it is safer and also accepts a function object, not just a name.
  rnd <- match.fun(distr)(d[1]*d[2])
  dim(rnd) <- c(d[1],d[2])
  test.results <- iid.test(zoo(rnd,order.by=1:d[1]),
                           plot=plot,Monte.Carlo=Monte.Carlo)
  invisible(test.results)
}

#' @export n.records
n.records <- function(x,verbose=FALSE) {
  ## Count record-events in 'x', both in forward chronological order and
  ## in the time-reversed series.  Returns (invisibly) a list with the
  ## number of records (N, N.rev), their timing indices (t, t.rev) and
  ## the record values (events, events.rev).  For a matrix/field input
  ## the elements hold the concatenated per-column results.
  if (verbose) print('n.records')
  if (!inherits(x,'zoo')) x <- zoo(x,order.by=1:length(x))
  y <- x
  # Replace non-finite values by the series minimum so that gaps can
  # never register as record-events.
  y[!is.finite(y)] <- min(y,na.rm=TRUE)
  if(is.null(dim(y))) {
    ## single time series
    ## Frequent ties invalidate the test, which assumes the record
    ## probability 1/n for distinct values -- warn if many values repeat.
    if (length(rownames(table(y))) < 0.99 * length(y)) {
      print("---Warning---Warning---Warning---Warning---Warning---Warning---")
      # fixed typo in the original message: 'r.records' -> 'n.records'
      print("n.records (iid.test): Warning, the time series contains many similar values!")
      print("The test does not work for cases where ties are common")
      print("See Benestad (2004) 'Record-values, non-stationarity tests and extreme value distributions' Global and Planetary Change, 44, 11-26")
      print("http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VF0-4D6373Y-2&_coverDate=12%2F01%2F2004&_alid=228212815&_rdoc=1&_fmt=&_orig=search&_qd=1&_cdi=5996&_sort=d&view=c&_acct=C000056508&_version=1&_urlVersion=0&_userid=2181464&md5=632559476e84eb8c48287cf8038690d2")
    }
    y.rev <- rev(y)
    index(y.rev) <- index(y) # ensure that only numbers are reversed and not the time stamps
  } else {
    ## Field objects or multiple time series: reverse each column in time.
    y.rev <- zoo(apply(y, 2, rev),order.by=index(y))
  }
  if (verbose) {
    str(y)
    str(y.rev)
  }

  if (verbose) print('fast algorithm')
  events <- records(y,verbose=verbose)
  if (verbose) print('reverse series')
  events.rev <- records(y.rev,verbose=verbose)
  
  if (is.numeric(events)) { 
    if (verbose) print('single series')
    # Number of records = finite record values; timing kept in attr 't'.
    N <- sum(is.finite(events)) 
    rec.t <- attr(events,'t')
    N.rev <- sum(is.finite(events.rev))
    rec.t.rev <- attr(events.rev,'t')
  } else if (is.list(events)) {
    if (verbose) print('matrix')
    # One list element per column: concatenate counts and timings.
    N <- unlist(lapply(events,function(x) sum(is.finite(x))))
    rec.t <- unlist(lapply(events,function(x) attr(x,'t')))
    N.rev <- unlist(lapply(events.rev,function(x) sum(is.finite(x))))
    rec.t.rev <- unlist(lapply(events.rev,function(x) attr(x,'t')))
  } else stop(paste('n.records - not programmed to handle',class(events)))
  if (verbose) print('organise into list object')
  # (local renamed from 'records'/'t', which shadowed the sibling
  # function records() and base::t)
  result <- list(N=N,t=rec.t,events=events,N.rev=N.rev, 
                 t.rev=rec.t.rev, events.rev=events.rev)
  invisible(result)
} 

## This algorithm is faster than the older code that used a for-loop:
## search 'backward', starting with the highest value.
#' @export
records <- function(x,verbose=FALSE,diff=FALSE) {
  ## Identify the record-breaking (maximum) events in 'x'.  Returns the
  ## record values in chronological order with attribute 't' holding the
  ## corresponding timing indices; with diff=TRUE the increments between
  ## successive records are returned instead of their values.  For a
  ## matrix, returns a list with one such vector per column.
  if (verbose) print('records')
  if (!is.null(dim(x))) {
    if (verbose) print('matrix: apply recursive routine to each column')
    # lapply (rather than apply) guarantees a list result even when all
    # columns happen to contain the same number of records -- apply()
    # would then simplify to a matrix and silently drop the 't'
    # attributes that n.records relies on.  verbose/diff are now also
    # forwarded to the per-column calls.
    r <- lapply(seq_len(dim(x)[2]),
                function(j) records(x[,j],verbose=verbose,diff=diff))
    return(r)
  }
  n <- length(x)
  r <- rep(NA,length(x)); t <- rep(NA,length(x))
  ii <- 1; i <- length(x)
  if (verbose) print(n)
  # Work backwards: find the global maximum, then repeat on the sub-series
  # preceding it; every maximum found this way is a record-event.
  while(length(x) >= 1 && i > 1) {
    z <- max(x,na.rm=TRUE)
    r[ii] <- z
    i <- (1:n)[is.element(x,z)][1]   # first occurrence of the maximum
    t[ii] <- i; ii <- ii  + 1
    # note: when i == 1, x[1:(i-1)] is x[c(1,0)] == x[1]; the loop then
    # terminates anyway because of the i > 1 condition
    if (is.finite(i)) x <- x[1:(i-1)] else i <- 0
    if (verbose) print(c(z,ii,i,length(x)))
  }
  
  # Drop unused slots, restore chronological order.
  t <- t[is.finite(r)]
  r <- r[is.finite(r)]
  r <- rev(r)
  if (diff) r <- c(min(r),diff(r))  # increments between successive records
  attr(r,'t') <- rev(t)
  return(r)
}

#' @export test.records
test.records <- function(N=1000) {
  ## Visual sanity check for records(): a white-noise series with a linear
  ## upward trend should show record-events (red dots) scattered along its
  ## whole length rather than only at the start.
  trend <- seq(0, 1, length=N)
  series <- rnorm(N) + trend
  rec <- records(series)
  plot(series, type='l')
  points(attr(rec, 't'), rec, col='red')
}
# metno/esd documentation built on March 9, 2024, 11:21 a.m.