"MGBT" <- function(...) MGBT17c(...)
#COHN:P3_089 ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN:P3_089
#COHN:P3_089 Multiple Grubbs-Beck Test (MGBT)
#COHN:P3_089 Identifies up to n/2 low outliers
#COHN:P3_089 Modified 28 Jun 2016 to reflect MGBT [Stedinger, Cohn, etc.]
#COHN:P3_089
#COHN:P3_089 ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN:P3_089
#COHN:P3_089MGBT <- function(Q,Alphaout=0.005,Alphazeroin=0.10,n2=floor(length(Q)/2)){
#COHN:P3_089 zt <- sort(log10(pmax(1e-8,Q)))
#COHN:P3_089 n <- length(zt)
#COHN:P3_089 pvalueW <-rep(-99,n2);w<-rep(-99,n2)
#COHN:P3_089 j1=0;j2=0
#COHN:P3_089 for(i in 1:n2) {
#COHN:P3_089 w[i]<-(zt[i]-mean(zt[(i+1):n]))/sqrt(var(zt[(i+1):n]))
#COHN:P3_089 pvalueW[i]<-KthOrderPValueOrthoT(n,i,w[i])$value
#COHN:P3_089 if(pvalueW[i]<Alphaout){j1<-i;j2<-i}
#COHN:P3_089 if( (pvalueW[i]<Alphazeroin) & (j2==i-1)){j2<-i}
#COHN:P3_089 }
#COHN:P3_089 return(list(klow=j2,pvalues=pvalueW,LOThresh=ifelse(j2>0,sort(Q)[j2+1],0)))
#COHN:P3_089}
"MGBTcohn2016" <- # arguments match MGBTcohn2013 but the alphas were renamed
function(x, alphaout=0.005, alphazeroin=0.10, n2=floor(length(x)/2),
napv.zero=TRUE, offset=0) {
x <- x[! is.na(x)]
# WHA notes that by TAC using 1e-8 as "small", he has assumed that the MGBT will
# always identify these as low outliers. Safe assumption in flood peaks, but
# it is a weakness. WHA would have kept them as -Inf (we are in R and not
# FORTRAN) and dealt with -Inf hitting subcomponents of the MBGT and trapping
# the issue there.
if(length(unique(x)) == 1) { # WHA error trapping
return(list(pvalues=NA, klow=0, LOThresh=0)) # WHA error trapping
} # WHA error trapping
zt <- sort( log10( pmax(1e-8,x)) ) # matches MGBTcohn2013
n <- length(zt) # WHA: n <- length(x) would be bit cleaner # matches MGBTcohn2013
pvalueW <- w <- rep(NA, n2) # matches MGBTcohn2013
j1 <- j2 <- 0 # matches MGBTcohn2013
for(i in 1:n2) { # matches MGBTcohn2013
w[i] <- (zt[i] - mean(zt[(i+1):n])) / sqrt(var(zt[(i+1):n])) # matches MGBTcohn2013
pvalueW[i] <- RthOrderPValueOrthoT(n, i, w[i])$value # matches MGBTcohn2013
if( is.na(pvalueW[i])) { # WHA error trapping
pvalueW[i] <- ifelse(napv.zero, 0, NA) # WHA error trapping
if(is.na(pvalueW[i])) next # WHA error trapping
} # WHA error trapping
# Next line as long as the p-value is less than alpha_out, we continue to
# index up because of the direction of the for() loop. This indexing up has the
# same effect as "sweeping out" from the median (well if n2=length(x)/2) as
# discussed in 17C so j1 will always attain the kth-largest low values having
# a p-value smaller than alpha_out. We can conclude from this that 17C is
# being followed in this regard and that alphaout is the same as LaTeX \alpha_out
# in 17C. Notice that alphaout will skip over low values whose p-value might not
# be small enough and declare them as outliers. This then seems to be a greedy approach.
# We must conclude that j1 then is the index associated with the outward sweep through
# in implementation, we are moving up the order statistics. 17C says that the outward
# sweep "STOPS" but in implementation this is not what happens. TAC is dealing with
# both alphas in one loop!
if((pvalueW[i] < alphaout)) { j1 <- i; j2 <- i } # matches MGBTcohn2013
# TAC seems to break from 17C in the notation that follows. First, this alpha must
# be LaTeX \alpha_in in 17C. Because j2 gets set to i if the condition is jointly
# true, then it can be seen that it is more visually sweeping in towards the median.
# The (j2==(i-1)) is harder to "see" but carefully considering its purpose, it
# insures that one has a solid continuity of low outliers from i=1 on up to j2. Hence,
# it is less greedy than j1 by requiring this. j2 must be associated with sweep in as
# j1 is dealing with sweep out. 17C says that the inward sweep "STOPS" but in
# implementation this is not what happens. TAC is dealing with both alphas in one loop!
if((pvalueW[i] < alphazeroin) & (j2==i-1)) j2 <- i # matches MGBTcohn2013
}
# TAC does not include a recursion back to MGBTcohn2016() for the
# condition j2 == n2 (see MGBTcohn2013) at this point in the code.
# ---***--------------***--- TAC CRITICAL BUG ---***--------------***---
# j2 <- min(c(j1,j2)) # WHA tentative completion of the 17C alogrithm!?!
# HOWEVER MAJOR WARNING. WHA is using a minimum and not a maximum!!!!!!!
# See MGBT17C() below. In that if the line a few lines above that reads
# if((pvalueW[i] < alpha1 )) { j1 <- i; j2 <- i }
# is replaced with if((pvalueW[i] < alpha1 )) j1 <- i
# then maximum and not the minimum becomes applicable.
# ---***--------------***--- TAC CRITICAL BUG ---***--------------***---
zz <- list(pvalues=pvalueW, klow=j2, # matches MGBTcohn2013
LOThresh=ifelse(j2 > 0, sort(x)[j2+1]+offset, 0))# quasi matches MGBTcohn2013
# WHA: What bothers me is that j1 does not seem to get involved in the final result. Just
# by quick inspection, this does not seem symmetrical. 17C says "the number of
# low outliers identified by [MGBT] is the larger of k and m-1" The "m" appears to be
# j2 and j1 appears to be the k. TAC does not compute the maximum (larger there of) and
# with this conclusion is with evidence of several code bundles available verbatim---
# IS IT POSSIBLE THAT TAC DID NOT COMPLETE THE 17C ALGORITHM IN MGBT?
# Looking closely without yet proof, m-1 does not translate to j2-1 given the
# structure of TACs algorithm.
zz$LOThresh[zz$LOThresh < 0] <- 0 # WHA 02/04/2019
return(zz) # the offset is by WHA # matches MGBTcohn2013
}
#COHN ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN
#COHN Multiple Grubbs-Beck Test (MGBT)
#COHN Identifies up to n/2 low outliers
#COHN
#COHN ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN
#COHN MGBT <- function(Q,alpha1=0.005,alpha10=0.10,n2=floor(length(Q)/2)){
#COHN zt <- sort(log10(pmax(1e-8,Q)))
#COHN n <- length(zt)
#COHN pvalueW <-rep(-99,n2);w<-rep(-99,n2)
#COHN j1=0;j2=0
#COHN for(i in 1:n2) {
#COHN w[i]<-(zt[i]-mean(zt[(i+1):n]))/sqrt(var(zt[(i+1):n]))
#COHN pvalueW[i]<-KthOrderPValueOrthoT(n,i,w[i])$value
#COHN if(pvalueW[i]<alpha1){j1<-i;j2<-i}
#COHN if( (pvalueW[i]<alpha10) & (j2==i-1)){j2<-i}
#COHN }
#COHN if(j2==n2){
#COHN if(n2<length(Q)-5) { # set a limit of at least 5 retained observations
#COHN print(paste(" Number of low outliers equals or exceeds limit of ",n2));
#COHN return(MGBT(Q,alpha1=0.01,alpha10=0.10,n2=length(Q)-5))} # try this
#COHN else
#COHN print("MGBT identifies too many low outliers; use caution and judgment")
#COHN }
#COHN return(list(klow=j2,pvalues=pvalueW,LOThresh=ifelse(j2>0,sort(Q)[j2+1],0)))
#COHN }
"MGBTcohn2013" <-
function(x, alphaout=0.005, alphazeroin=0.10, n2=floor(length(x)/2),
napv.zero=TRUE, offset=0) {
x <- x[! is.na(x)]
if(length(unique(x)) == 1) { # WHA error trapping
return(list(pvalues=NA, klow=0, LOThresh=0)) # WHA error trapping
} # WHA error trapping
zt <- sort( log10( pmax(1e-8,x)) )
n <- length(zt) # WHA: n <- length(x) could be used instead and a bit cleaner
pvalueW <- w <- rep(NA, n2)
j1 <- j2 <- 0
for(i in 1:n2) {
w[i] <- (zt[i] - mean(zt[(i+1):n])) / sqrt(var(zt[(i+1):n]))
pvalueW[i] <- RthOrderPValueOrthoT(n, i, w[i])$value
if( is.na(pvalueW[i])) { # WHA error trapping
pvalueW[i] <- ifelse(napv.zero, 0, NA) # WHA error trapping
if(is.na(pvalueW[i])) next # WHA error trapping
} # WHA error trapping
if((pvalueW[i] < alphaout )) { j1 <- i; j2 <- i }
if((pvalueW[i] < alphazeroin) & (j2==i-1) ) j2 <- i
}
# ---***--------------***--- TAC CRITICAL BUG ---***--------------***---
# j2 <- min(c(j1,j2)) # WHA tentative completion of the 17C alogrithm!?!
# HOWEVER MAJOR WARNING. WHA is using a minimum and not a maximum!!!!!!!
# See MGBT17C() below. In that if the line a few lines above that reads
# if((pvalueW[i] < alpha1 )) { j1 <- i; j2 <- i }
# is replaced with if((pvalueW[i] < alpha1 )) j1 <- i
# then maximum and not the minimum becomes applicable.
# ---***--------------***--- TAC CRITICAL BUG ---***--------------***---
# Curious that TAC does two length(x) below, but that length is already known.
if(j2 == n2) { # TODO, does TAC change his mind on 5 retained observations?
# Also alphaout changes.
if( n2 < n-5 ) { # set limit of at least 5 retained below 1/2 the sample
message( paste0(" Number of low outliers equals or exceeds limit of ",n2) )
zz <- MGBTcohn2013(x, alphaout=0.01, alphazeroin=0.10, n2=n-5,
napv.zero=napv.zero) # recursion
return(zz)
} else {
message("MGBT identifies too many low outliers, use caution and judgment")
}
}
zz <- list(pvalues=pvalueW, klow=j2,
LOThresh=ifelse(j2 > 0, sort(x)[j2+1]+offset, 0))# quasi matches MGBTcohn2016
zz$LOThresh[zz$LOThresh < 0] <- 0 # WHA 02/04/2019
return(zz) # the offset is by WHA
}
#COHN ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN
#COHN Multiple Grubbs Beck test (Cohn et al., 2011)
#COHN p. 169
#COHN MGB <- function(x,k,n=length(x)){
#COHN y=sort(x);(y[k]-mean(y[(k+1):n]))/sqrt(var(y[(k+1):n]))}
"MGBTcohn2011" <- function(x, r, n=length(x)) {
x <- x[! is.na(x)]
y <- sort(x)
( y[r] - mean(y[(r+1):n]) ) / sqrt( var(y[(r+1):n]) )
}
#COHN ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN
#COHN Multiple Grubbs-Beck Test (MGBTnb)
#COHN This eliminates the backup procedure
#COHN Added for JL/JRS study 24 Aug 2012 (TAC)
#COHN
#COHN ****|==|====-====|====-====|====-====|====-====|====-====|====-====|==////////
#COHN
#COHN MGBTnb <- function(Q,alpha1=0.005,alpha10=0.10,n2=floor(length(Q)/2)){
#COHN zt <- sort(log10(pmax(1e-8,Q)))
#COHN n <- length(zt)
#COHN pvalueW <-rep(-99,n2);w<-rep(-99,n2)
#COHN j1=0;j2=0
#COHN for(i in 1:n2) {
#COHN w[i]<-(zt[i]-mean(zt[(i+1):n]))/sqrt(var(zt[(i+1):n]))
#COHN pvalueW[i]<-KthOrderPValueOrthoT(n,i,w[i])$value
#COHN if(pvalueW[i]<alpha1){j1<-i;j2<-i}
#COHN }
#COHN if( (pvalueW[1]<alpha10) & (j2==0)){j2<-1}
#COHN if(j2==n2){
#COHN if(n2<length(Q)-5) { # set a limit of at least 5 retained observations
#COHN print(paste(" Number of low outliers equals or exceeds limit of ",n2));
#COHN return(MGBT(Q,alpha1=0.01,alpha10=0.10,n2=length(Q)-5))} # try this
#COHN else
#COHN print("MGBT identifies too many low outliers; use caution and judgment")
#COHN }
#COHN return(list(klow=j2,pvalues=pvalueW,LOThresh=ifelse(j2>0,sort(Q)[j2+1],0)))
#COHN }
# In MGBTcohn2013, TAC uses recursion back to that same function. The MGBTnb()
# recurses to MGBTcohn2013 and not back to itself, which appears the only real
# difference between the two.
"MGBTnb" <-
function(x, alphaout=0.005, alphazeroin=0.10, n2=floor(length(x)/2),
napv.zero=TRUE, offset=0) {
x <- x[! is.na(x)]
if(length(unique(x)) == 1) { # WHA error trapping
return(list(pvalues=NA, klow=0, LOThresh=0)) # WHA error trapping
} # WHA error trapping
zt <- sort( log10(pmax(1e-8,x)) )
n <- length(zt)
pvalueW <- w <- rep(NA, n2) # rep(-99,n2)
j1 <- j2 <- 0
for(i in 1:n2) {
w[i] <- (zt[i]-mean(zt[(i+1):n]))/sqrt(var(zt[(i+1):n]))
pvalueW[i] <- RthOrderPValueOrthoT(n,i,w[i])$value
if( is.na(pvalueW[i])) { # WHA error trapping
pvalueW[i] <- ifelse(napv.zero, 0, NA) # WHA error trapping
if(is.na(pvalueW[i])) next # WHA error trapping
} # WHA error trapping
if(pvalueW[i] < alphaout) { j1 <- i; j2 <- i }
}
if( pvalueW[1] < alphazeroin & j2 == 0 ) j2 <- 1 # Notice this is outside the loop and
# also is slightly different logic than seen in MGBTcohn2013 and MGBTcohn2016.
# It is curious that TAC does two length(x) in logic below, but that length is already known.
if(j2 == n2) {
if(n2 < n-5) { # set a limit of at least 5 retained observations
message(" Number of low outliers equals or exceeds limit of ",n2)
return(MGBTcohn2013(x, alphaout=0.01, alphazeroin=0.10, n2=n-5),
napv.zero=napv.zero) # recursion
} else {
message("MGBT identifies too many low outliers; use caution and judgment")
}
}
zz <- list(pvalues=pvalueW, klow=j2,
LOThresh=ifelse(j2 > 0, sort(x)[j2+1]+offset, 0))# matches MGBTcohn2013
zz$LOThresh[zz$LOThresh < 0] <- 0 # WHA 02/04/2019
return(zz) # the offset is by WHA
}
# This is a slower function in general practice between some omegas (w) get
# doubly computed. This function was developed by WHA explicitly following
# verbatim the language inside the B17C publication:
# https://pubs.er.usgs.gov/publication/tm4B5
# This function serves as a backstop against TAC having a systemtic and
# undetected fault in his R idiom implementation of the MGBT in the various
# incarations credited to him within this source code file.
"MGBT17c.verb" <- # arguments match MGBT17C
function(x, alphaout=0.005, alphain=0, alphazeroin=0.10, n2=floor(length(x)/2),
napv.zero=TRUE, offset=0, min.obs=0) {
x <- x[! is.na(x)]
n <- length(x); idx <- c(n,n2,rep(NA, 3))
txt <- c("n", "n2", "ix_alphaout", "ix_alphain", "ix_alphazeroin")
names(idx) <- txt
if(length(unique(x)) == 1) return(list(index=idx, klow=0, LOThresh=0))
if(n < min.obs) return(list(index=idx, klow=0, LOThresh=0))
if(n2 > n) {
warning("n2 > n, which does not make not logical sense")
return(NULL)
}
small <- sqrt(.Machine$double.eps); j1 <- j2 <- 0
x <- sort(x); zt <- log10( pmax(small, x) )
for(i in seq(n2,1,by=-1)) { # NOTE LITERALLY SWEEP OUT as per 17C
w <- (zt[i]-mean(zt[(i+1):n])) / sqrt(var(zt[(i+1):n]))
pv <- RthOrderPValueOrthoT(n, i, w)$value
if(napv.zero & is.na(pv)) pv <- 0
if(is.na(pv)) next
if( pv >= alphaout) { j1 <- i; next } # effect of sweep out, NOTE GREATER THAN EQUAL
break
}
j1 <- j1 - 1 # this is critical!
for(i in seq(1,n2,by=+1)) { # NOTE LITERALLY SWEEP IN as per 17C
w <- (zt[i]-mean(zt[(i+1):n])) / sqrt(var(zt[(i+1):n]))
pv <- RthOrderPValueOrthoT(n, i, w)$value
if(napv.zero & is.na(pv)) pv <- 0 # NA handling
if(is.na(pv)) next # protection from a NA leaking into these
if((pv <= alphazeroin ) & (j2==i-1)) { j2 <- i; next } # effect of sweep in,
# NOTE LESS THAN EQUAL and NOT LESS THAN
break
}
k <- max(j1,j2) # as per B17C
if(k >= n) return(list(index=idx, klow=0, LOThresh=0))
idx <- c(n,n2,j1,NA,j2); names(idx) <- txt
zz <- list(index=idx, klow=k,
LOThresh=ifelse(k > 0, x[k+1]+offset, 0))
zz$LOThresh[zz$LOThresh < 0] <- 0 # WHA 02/04/2019
return(zz)
}
# This code base follows more along lines of WHA idioms implementing B17C in
# the spirit of TAC's coding and faster than the verbatim implementation mentioned
# above. This function is to be the user-level version of the MGBT.
"MGBT17c" <- # arguments match MGBTcohn2013 but the alphas were renamed to match 17C
function(x, alphaout=0.005, alphain=0, alphazeroin=0.10, n2=floor(length(x)/2),
napv.zero=TRUE, offset=0, min.obs=0) {
x <- x[! is.na(x)]
n <- length(x); idx <- c(n,n2,rep(NA, 3))
txt <- c("n", "n2", "ix_alphaout", "ix_alphain", "ix_alphazeroin");
if(length(unique(x)) == 1) return(list(index=idx, omegas=NA, x=NA, pvalues=NA,
klow=0, LOThresh=0, message="all data are equal"))
if(n < min.obs) return(list(index=idx, omegas=NA, x=NA, pvalues=NA,
klow=0, LOThresh=0, message="too few data"))
if(n2 > n) {
warning("n2 > n, which does not make not logical sense")
return(NULL)
}
small <- sqrt(.Machine$double.eps); j1 <- j2 <- 0
x <- sort(x); zt <- log10( pmax(small, x) )
# 17C says sweeps out and in and STOP each time, which is the effect we seek, but for
# simplicity motivated by desire to return p-values for all 1:n2 values we must
# expend CPU time computing all omegas and pvalues. This was TAC motivation (at least
# concluded) for a single loop. Below are R idioms by WHA.
w <- sapply(1:n2, function(i) (zt[i]-mean(zt[(i+1):n])) /
sqrt(var(zt[(i+1):n])) )
pv <- sapply(1:n2, function(i) RthOrderPValueOrthoT(n, i, w[i])$value) # p-values
sw <- mgbt.sweeper(pv, alphaout=alphaout, alphain=alphain, alphazeroin=alphazeroin)
k <- sw$index
# don't know of this is next line is legitimately accessible but trap anyway
if(k >= n) return(list(index=idx, omegas=NA, x=NA, pvalues=NA, klow=0, LOThresh=0))
idx <- c(n,n2,sw$sweeps[1],sw$sweeps[2],sw$sweeps[3]); names(idx) <- txt
zz <- list(index=idx, omegas=w[1:n2], x=x[1:n2], pvalues=pv, klow=k,
LOThresh=ifelse(k > 0, x[k+1]+offset, 0))
zz$LOThresh[zz$LOThresh < 0] <- 0 # WHA 02/04/2019
return(zz)
}
"mgbt.sweeper" <-
function(pvs, alphaout=0.005, alphain=0, alphazeroin=0.1) {
j1 <- j2 <- j3 <- 0 # outward, inward from 1+ outward, inward sweep
n2 <- length(pvs) # the js are numer of low outliers and not the index of the threshold
if(any(is.na(pvs))) stop("NA p-value, wrapper code should make decision")
for(i in seq(n2,1,-1)) if(pvs[i] < alphaout) { j1 <- i; break }
if(j1 < n2) { # Late addition station 08020500 causes j1 to be n2 [A WHA fix to TAC.]
# with Q=c(19500, 17400, 9350, 3760, 8410, 15200, 5370, 22300, 21500, 6590, 17300)
for(i in j1+1:n2) if(pvs[i] >= alphain) { j2 <- i-1; break }
} else if(j1 == n2) {
# This means that the last pv is < alphaout
} else {
stop("should not be here in logic")
}
for(i in 1:n2) if(pvs[i] >= alphazeroin) { j3 <- i-1; break }
js <- c(j1,j2,j3); names(js) <- c("ix_alphaout", "ix_alphain", "ix_alphazeroin")
return(list(index=max(js), sweeps=js))
}
# outward sweep number of low outliers
# inward sweep number of low outliers
# other inward sweep number of low outliers
# Peer Review Notes: Mauli Chandra, North Carolina State University (07/23/2019)
# I tested the package for some of the sites using MGBTcohn2016 and MGBTnb
# functions. How are these two functions different, as they are found to be
# giving slightly different results in few cases?
# WHA (07/23/2019): Mauli, note that my comments in MGBT.R indicate that MGBTnb
# has some conditional
# if( pvalueW[1] < alphazeroin & j2 == 0 ) j2 <- 1
# that resides outside a loop compared to MGBTcohn2016 (or MGBTcohn2013). The
# MGBTnb() uses sometype of recursion. I do not understand what TAC meant to
# do with MGBTnb() or what type of experiment is was pursuing. Please note that
# MGBTcohn2016/2013 and MGBTnb() are retained for historical reference and are
# not intended for deployment. As to differences that you have seen in the
# performance between the two, this is related to subtle differences at the time
# in TAC's thoughts. The differences are restricted to whether the
# X_k or X_(k+1) order statistics are chosen as the result of the MGBT.
# Finally, by design the "official" implementation of this package is
# "MGBT" <- function(...) MGBT17c(...) # which is the first line of this file.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.