R/univ.R

Defines functions al logit shorth tukeyor tukeyorinds box.scale rououtinds rouor shorth.scale hampoutinds hampor hamp.scale.4 hamp.scale.3 gesdri skesd skesd.old lamtab calout.detect

Documented in box.scale calout.detect hampor hamp.scale.3 hamp.scale.4 rouor shorth shorth.scale tukeyor

calout.detect <- function( x, alpha=0.05,
                method = c("GESD", "boxplot", "medmad", "shorth", "hybrid"), 
                             k = ((length(x) %% 2) * floor(length(x)/2) +
		                   (1 - (length(x) %% 2)) * (length(x)/2 - 1)), 
		                   scaling,
		                   ftype,
		     		location,
		     		scale,
		                     gen.region = function(x,location,scale,scaling,alpha)
		     			{
		     			g <- scaling(length(x),alpha)
		     			location(x)+c(-1,1)*g*scale(x)
		     			}
 )
	{
	if (length(method) > 1) {
		warning("GESD detection selected by default")
		method <- method[1]
		}
	if (method == "hybrid" & ( missing(scaling) | 
	                missing(location) | missing(scale) ) )
		stop("hybrid detection requires scaling, location and args")
	if (method != "GESD" & missing(scaling)) 
		{
		#warning("default scaling function assigned")
		if (method == "shorth") scaling <- shorth.scale
		else if (method == "medmad") scaling <- hamp.scale.4
		else if (method == "boxplot") scaling <- box.scale
		else if (method == "hybrid") 
			{
			scaling <- scaling
			location <- location
			scale <- scale
			}
		}
	n <- length(x)
	if ( method == "GESD" )
		ans <- gesdri( x=x, k=k, alpha=alpha )
	else if ( method == "boxplot" )
		{
	        g <- scaling( n, alpha )
		ans <- tukeyorinds( x=x, alpha=alpha, g=g )
		}
	else if ( method == "medmad" )
		{
	        g <- scaling( n, alpha )
		ans <- hampoutinds( x=x, alpha=alpha, g=g )
		}
	else if ( method == "shorth" )
		{
	        g <- scaling( n, alpha )
		ans <- rououtinds( x=x, alpha=alpha, g=g )
		}
	else if ( method == "hybrid" )
		{
		OR <- gen.region(x,location,scale,scaling,alpha)
		outinds <- c((1:length(x))[x<OR[1]],(1:length(x))[x>OR[2]])
		if (is.null(outinds)) ans <- list(ind=NA,val=NA,outlier.region=OR)
		else ans<- list(ind=outinds,val=x[outinds],outlier.region=OR)
		}
		

		ans
		
	}


#ckesd <-
#function(x, k)
#{
#	n <- as.integer(length(x))
#	x <- as.double(x)
#	k <- as.integer(k)
#	resvec <- as.double(rep(0, k))
#	q <- as.integer(rep(0, n))
#	indvec <- as.integer(rep(0, k))
#	z <- .C("ckesd",
#		x = x,
#		n = n,
#		k = k,
#		resvec = resvec,
#		q = q,
#		indvec = indvec,
#		NAOK = TRUE,
#		specialsok = TRUE,
#		pointers = NULL)
#	z
#}

lamtab <-
function(n, k, alpha = 0.05)
{
# obtain the sequence of k critical values
# to be used in checking for k outliers
	l <- 0:(k - 1)
	p <- 1 - ((alpha/2)/(n - l))
# note, this can trip a bug in Splus 3.2 qt if n is very large (>50000?)
# the invibeta routine needs to be iterated some more
	tvals <- qt(df = n - l - 2, p = p)
	lams <- ((n - l - 1) * tvals)/sqrt((n - l - 2 + tvals^2) * (n - l))
	lams
}


skesd.old <- function(x, k)
{
        bigres <- rep(NA, k)
        bigresind <- rep(NA, k)
        n <- length(x)
        curx <- x
        ind <- 1:n
        inds <- NULL
        for(i in 1:k) {
                last <- n - i + 1
                m <- mean(curx)
                ares <- abs(curx - m)
                oares <- order(ares)
                bigres[i] <- ares[oares[last]]/sqrt(var(curx))
                curx <- curx[ - oares[last]]
                inds <- c(inds, ind[oares[last]])
                ind <- ind[ - oares[last]]
        }
        list(res = bigres, ind = inds)
}

skesd <- function(x, k)
{
# thanks to Jerry Lewis of Biogen Idec
        n <- length(x)
        ind <- order(x)  # only sort once, outside the loop
        curx <- x[ind]
        inds <- bigres <- NULL
        for(last in n:(n-k+1)) {
                m <- mean(curx)
                ares <- abs(curx[c(1,last)] - m)
                if(ares[1]==ares[2]) {  # handle ties on opposite sides of m deliberately -- test the one farthest from median
                    i <- ifelse(median(curx)>m,1,last)
                } else {
                    i <- ifelse(ares[1]>ares[2],1,last)
                }
                bigres <- c(bigres, ares[ifelse(i==1,1,2)]/sqrt(var(curx)))
                curx <- curx[-i]
                inds <- c(inds, ind[i])
                ind <- ind[-i]
        }
        list(res = bigres, ind = inds)
}


gesdri <-
function(x, k = ((length(x) %% 2) * floor(length(x)/2) +
	      (1 - (length(x) %% 2)) * (length(x)/2 - 1)), alpha = 0.05)
{
#        if (is.loaded("ckesd"))
#             E <- ckesd(x, k)
#        else
        E <- skesd(x, k)
	R <- E$res
	I <- E$ind
	n <- length(x)	#if(n == 100 & k == 49)
	L <- lamtab(length(x), k, alpha = alpha)
	worst <- NA
	if (any(R>L)) worst <- max((1:k)[R > L])
	if(!is.finite(worst) || is.na(worst))
		list(ind = NA, val = NA)
	else list(ind = I[1:worst], val = x[I[1:worst]])
}

hamp.scale.3 <-
function(n, alpha)
{
# formula (15) p 790, alpha=.05
if ( n < 10 ) warning("procedure not calibrated for n < 10")
if (alpha == 0.05)
	if(n %% 2) 2.906 + 12.99 * (n - 5)^(-0.5781) else 2.906 + 11.99 * (n - 
			6)^(-0.5651)
else if (alpha == 0.01)
	if(n %% 2) 3.819 + 26.50 * (n - 7)^(-0.6089) else 3.819 + 24.09 * (n - 
			7)^(-0.5936)
else stop("procedure only calibrated for alpha=0.01, 0.05")
}

hamp.scale.4 <-
function(n, alpha)
{
if ( n < 10 ) warning("procedure not calibrated for n < 10")
# formula (16)
if (alpha == 0.05)
	if(n %% 2) 1.483 * qnorm(0.975^(1/n)) + 14.43 * ((n - 3)^(-0.7939))
		 else 1.483 * qnorm(0.975^(1/n)) + 21.61 * ((n + 1)^(-0.8655))
else if (alpha == 0.01)
	if(n %% 2) 1.483 * qnorm(0.995^(1/n)) + 24.48 * ((n - 5)^(-0.8236))
		 else 1.483 * qnorm(0.995^(1/n)) + 41.39 * ((n)^(-0.9143))
else stop("procedure only calibrated for alpha=0.01, 0.05")
}

hampor <-
function(x, g)
{
	lx <- length(x)
	Mad <- function(x)
	median(abs(x - median(x)	# Mad is the resistant scale measure
	))
	m <- median(x)
	s <- Mad(x)
	L <- m - s * g
	U <- m + s * g
	c(L, U)
}

hampoutinds <-
function(x, alpha, g=hamp.scale.4(length(x),alpha=alpha) )
{
	OR <- hampor(x, g)
	N <- length(x)
        outinds <- c((1:N)[x < OR[1]], (1:N)[x > OR[2]])
        if (sum(x<OR[1])==0 & sum(x>OR[2])==0) {
           warning("no data values in outlier region")  
           list(ind= NA, val= NA, outlier.region=OR) } else 
             list(ind= outinds, val= x[outinds], outlier.region=OR)
}


shorth.scale <- function(n, alpha=0.05)
{
# this function gives scaling values for the raw
# length of the shorth to be used in outlier detection.
        if(n < 10 | n > 2000) warning(
                        "rousseeuw calibration based on 10 <= n <= 2000")
        if(alpha == 0.05) {
                LN <- c(1, log(n)^(1:4))
                ecoef <- c(9.165430, -3.632088,
                        0.7811141, -0.07331514,
                        0.002603013)

                ocoef <- c(20.87936, -12.84945274,
                        3.50602884, -.42863233,
			0.01975694)
                if(n %% 2 == 0 | n >= 400)
                        sum(ecoef * LN)
                else sum(ocoef * LN)
        }
        else if(alpha == 0.01) {
                LN <- c(1, log(n)^(1:4))
                coe <- c(20.0988614238686, -9.7688241263516, 2.10296418210252, 
                        -0.198414242035932
                        , 0.00696260655978963)
                sum(coe * LN)
        }
        else stop("Rousseeuw scaling only defined for alpha = .05 or .01")
}


rouor <-
function(x, alpha=0.05, g=shorth.scale(length(x),alpha=alpha),frac=0.5)
{
	if (frac > .5 | frac < .5) stop("rousseeuw detection not defined for frac != .5")
	shss <- shorth(x,Alpha=frac)
# in a former incarnation, this outlier region was
# made unbiased.  but the complications are heavy
# and bias corrections are not exact.  so instead
# we now base the outlier region on lshorth which
# is basically unambiguously computable
	shss$mid - shss$len * g * c(1, -1) 
}

rououtinds <-
function(x, alpha=0.05, g=shorth.scale(length(x),alpha=alpha),frac=0.5)
{
	OR <- rouor(x, alpha, g, frac=0.5)
	N <- length(x)
        outinds <- c((1:N)[x < OR[1]], (1:N)[x > OR[2]])
        if (sum(x<OR[1])==0 & sum(x>OR[2])==0) {
           warning("no data values in outlier region")  
           list(ind= NA, val= NA, outlier.region=OR) } else list(ind= outinds, val= x[outinds], outlier.region=OR)
}

box.scale <-
function(n, alpha = 0.05)
{
        if(n > 2000)
                warning("extrapolations for n>2000 may be invalid")
        if(n <= 10)
                stop("n must be greater than 10")
        if(alpha == 0.05) {
                mod <- function(x, n)
                x %% n
                lmodff <- function(x)
                1.521267 + 0.1458931 * x
                sumodf <- function(x)
                2.668116 - 0.2723008 * x + 0.04326603 * x * x - 0.0009040389 * 
                        x * x * x
                slmodf <- function(x)
                2.308486 - 0.08364704 * x + 0.01044393 * x * x + 0.0009883642 * 
                        x * x * x
                ln <- log(n)
                if(ln > 5.7)
                        lmodff(ln)
                else if(mod(n, 4) == 0 | mod(n, 4) == 1)
                        slmodf(ln)
                else sumodf(ln)
        }
        else if(alpha == 0.01) {
                qmod <- function(n)
                7.085162 - 2.692155 * n + 0.5950019 * n * n - 0.05763641 * n * 
                        n * n + 0.002165973 * n * n * n * n
                qmod(log(n))
        }
	else stop("procedure defined only for alpha = .01 or .05")
}


tukeyorinds <-
function(x, alpha = 0.05, g = box.scale(length(x), alpha=alpha))
{
	OR <- tukeyor(x,alpha, g)
	N <- length(x)
        outinds <- c((1:N)[x < OR[1]], (1:N)[x > OR[2]])
        if (sum(x<OR[1])==0 & sum(x>OR[2])==0) {
           warning("no data values in outlier region")  
           list(ind= NA, val= NA, outlier.region=OR) } else list(ind= outinds, val= x[outinds], outlier.region=OR)
}

tukeyor <-
function(x, alpha=0.05, g = box.scale(length(x), alpha=alpha), ftype = "ideal")
{
	n <- length(x)
	if(ftype == "ideal") # see Hoaglin, Iglewicz, JASA 1987
		f <- n/4 + 5/12
	else f <- 0.5 * floor((n + 3)/2)
	xs <- sort(x)
	ff <- floor(f)
	cf <- ceiling(f)
	fracinterp <- function(x, f) # suppose 1 < f < length(x)
		# this linearly interpolates between x[floor(f)] and x[ceiling(f)]
		# typical use requires x sorted
	{
		ind <- floor(f)
		base <- x[ind]
		top <- x[ind + 1]
		frac <- f - ind
		base + (top - base) * frac
	}
	Fl <- fracinterp(xs, f)
	Fu <- fracinterp(xs, n + 1 - f)
	Fspread <- Fu - Fl
	kf <- g * Fspread
	IFl <- Fl - kf
	IFu <- Fu + kf
	c(IFl, IFu)
}

shorth <- function(x, Alpha = 0.5)
{
	x <- sort(x)
	N <- length(x)
	nsamp <- floor(N * Alpha) + 1	# rouss/ler 1988 p104
	begs <- 1:(N - nsamp + 1)
	ends <- begs + nsamp - 1
	lens <- x[ends] - x[begs]
	Nlens <- length(lens)
	leasts <- (1:Nlens)[lens <= min(lens)]
	if(length(leasts) > 1)
		warning("shorth is not unique, leftmost used")
	least <- leasts[1]
	intv <- c(x[begs][least], x[ends][least])	
	# correction factor to eliminate the parity dependence 
# of shs, Rousseeuw and Leroy
# 1988 Stat Neer p115
	te <- 3/8
	ao2 <- Alpha/2	#	if((N %% 4) == 0) {
	q <- floor((0.5 + ao2) * N)
	k <- ceiling((0.5 - ao2) * N)
	if((N %% 4) == 0) {
		q <- q - 0.5
		k <- k + 0.5
	}
	else if((N %% 4) == 3) {
		q <- q - 0.5
		k <- k + 0.5
	}
	corr <- 1/((qnorm((q - te)/(N - 2 * te + 1))) - (qnorm((k - te)/(N - 2 * 
		te + 1))))	# empirical bias correction for alpha = .5 only
	bias.corr <- 0.89 + 0.142 * log(log(log(N)))
	list(shorth = intv, length.shorth = lens[least], midpt.shorth = mean(
		intv), meanshorth = mean(x[(begs[least]):(ends[least])]), 
		correction.parity.dep = corr, bias.correction.gau.5 = bias.corr,
		alpha = Alpha)
}

        
logit <- function(x) log(x/(1-x))
al <- function(x) plogis(x)
vjcitn/parodyGHA documentation built on Dec. 23, 2021, 4:06 p.m.