R/p2.htest.R
In CITAN: CITation ANalysis toolpack

## This file is part of the CITAN library.
##
## Copyright 2011 Marek Gagolewski
##
##
## CITAN is free software: you can redistribute it and/or modify
## it under the terms of the GNU Lesser General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## CITAN is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the
## GNU Lesser General Public License for more details.
##
## You should have received a copy of the GNU Lesser General Public License
## along with CITAN. If not, see <http://www.gnu.org/licenses/>.


# #' /internal-generalized version/
# .htest.getpowerupper <- function(n, wyg, cdf, PARAM_X, PARAM_Y, ...)
# {
# 	x <- (0:n);
#
# 	stopifnot(length(wyg)==n+1);
#
# 	POW <- numeric(length(PARAM_X));
# 	for (i in 1:length(PARAM_X))
# 	{
# 		POW[i] <- 1-sum(dhirsch(x,n,cdf,PARAM_X[i], ...)*phirsch(x+wyg+0.25,n,cdf,PARAM_Y[i], ...));
#
# 	}
#
# 	return(POW);
# }

# #' /internal-generalized version/
# .htest.acceptreg.improve <- function(n, cdf, wyg, j0, j1, alpha, verbose, PARAM, ...)
# {
# 	epsbound <- 1e-7;
# 	wn <- n+1;
#
# 	for (i in j0:j1)
# 	{
# 		lbound <- ifelse(i>1 && i < wn, min(wyg[i-1], wyg[i+1]), 0)
# 		if (wyg[i] > lbound)
# 		{
# 			repeat
# 			{
# 				wyg[i] <- wyg[i]-1L;
# 				POW <- .htest.getpowerupper(n, wyg, cdf, PARAM, PARAM, ...);
# 				if (max(POW)>alpha)
# 				{
# 					wyg[i] <- wyg[i]+1L;
# 					break;
# 				}
# 				if (wyg[i] == lbound) break;
# 			}
#
# 			if (verbose)
# 			{
# 				POW <- .htest.getpowerupper(n, wyg, cdf, PARAM, PARAM, ...);
# 				cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", abs((i-j0)/(j1-j0))*100, mean(POW)));
# 			}
# 		}
# 	}
#
# 	if (verbose)
# 	{
# 		POW <- .htest.getpowerupper(n, wyg, cdf, PARAM, PARAM, ...);
# 		cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", 100, mean(POW)));
# 	}
#
# 	return(wyg);
# }



#' /internal/
.pareto2.htest.getpowerupper <- function(n, wyg, K_X, K_Y, s)
{
	x <- (0:n);

	stopifnot(length(wyg)==n+1);

	POW <- numeric(length(K_X));
	for (i in 1:length(K_X))
	{
		POW[i] <- 1-sum(pareto2.dhirsch(x,n,K_X[i],s)*pareto2.phirsch(x+wyg+0.25,n,K_Y[i],s));
	}

	return(POW);
}






#' /internal/
.pareto2.htest.getsize_optimized <- function(n, x, wyg, K, s, dhirp2mat)
{
	POW <- numeric(length(K));
	for (i in 1:length(K))
	{
		POW[i] <- 1-sum(dhirp2mat[i,]*pareto2.phirsch(x+wyg+0.25,n,K[i], s));
	}

	return(POW);
}


#' /internal/
.pareto2.htest.acceptreg.improve2 <- function(n, wyg, alpha, verbose, K, s)
{
	epsbound <- 1e-7;
	wn <- n+1;


	# prepare input data for .pareto2.htest.getsize_optimized
	# this significanlty speeds up computations
	xhir0n <- (0:n);
	dhirp2mat <- matrix(nrow=length(K), ncol=length(xhir0n));
	for (i in 1:length(K))
		dhirp2mat[i,] <- pareto2.dhirsch(xhir0n,n,K[i],s);
	# -------------------------------------------------------

	stopifnot(length(wyg)==wn);

	v <- max(wyg);
	wyg <- rep(v,wn);
	stopifnot(max(.pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat)) <= alpha);
	stopifnot(max(.pareto2.htest.getsize_optimized(n, xhir0n, wyg-1, K, s, dhirp2mat)) > alpha);

	k1 <- wn-1;
	while (max(.pareto2.htest.getsize_optimized(n, xhir0n, c(rep(v-1,k1), rep(v,wn-k1)), K, s, dhirp2mat))>=alpha)
		k1 <- k1-1;

	k1 <- k1+1;
	k0 <- k1-1;
	maxdiff <- 2;
# 	while (max(.pareto2.htest.getsize_optimized(n, xhir0n, c(rep(v,k0), rep(v-1,wn-k0)), K, s, dhirp2mat))>=alpha)
# 		k0 <- k0+1;

	cat(sprintf("wn=%g; k0=%g; k1=%g; maxdiff=%g\n", wn, k0, k1, maxdiff));
	stopifnot(k0<k1);


	sizemax <- 0;
	qualmax <- 10;
	wygmax <- wyg;

	imprrec <- function(wyg, k0, k1)
	{
		if (k0 <= 0 && k1 > wn) return();

# 		if (k0 <= 0) k0 <- 1;
# 		if (k1 > wn) k1 <- wn;
		v0 <- ifelse(k0<=0, wyg[1], wyg[k0]);
		v1 <- ifelse(k1>wn, wyg[wn], wyg[k1]);

		j0 <- 0;
		while(T)
		{
			if (k0 >= 1) wyg [1:k0] <- v0-j0;

			j1 <- 0;
			while (T)
			{
				if (k1 <= wn) wyg[k1:wn] <- v1-j1;

				POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
				mp <- max(POW);

				if (mp <= alpha)
				{
					mep <- mean((POW-alpha)^2);

					if (mep<=qualmax)
					{
						wygmax <<- wyg;
						qualmax <<- mep;
						sizemax <<- mp;
						cat(sprintf("size=%.5f and q=%.5f. ", sizemax, qualmax));
						print(wyg);
					}

					imprrec(wyg, k0-1, k1+1);
				} else {
					j1 <- maxdiff;
				}

				j1 <- j1 + 1;
				if (v1-j1 < 0 || j1 > maxdiff || k1 > wn) break;
			}

			j0 <- j0 + 1;
			if (v0-j0 < 0 || j0 > maxdiff || k0 < 1) break;
		}
	}

	imprrec(wygmax, k0, k1);
# 	imprrec(wygmax, 0, k1);
# 	imprrec(wygmax, k0, wn+1);

	wyg <- wygmax;

# 	for (i in j0:j1)
# 	{
# 		lbound <- ifelse(i>1 && i < wn, min(wyg[i-1], wyg[i+1]), 0)
# 		if (wyg[i] > lbound)
# 		{
# 			repeat
# 			{
# 				wyg[i] <- wyg[i]-1L;
# 				POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
# 				if (max(POW)>alpha)
# 				{
# 					wyg[i] <- wyg[i]+1L;
# 					break;
# 				}
# 				if (wyg[i] == lbound) break;
# 			}
#
# 			if (verbose)
# 			{
# 				POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
# 				cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", abs((i-j0)/(j1-j0))*100, mean(POW)));
# 			}
# 		}
# 	}

	if (verbose)
	{
		POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
		cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", 100, mean(POW)));
	}

	return(wyg);
}



#' /internal/
.pareto2.htest.acceptreg.improve <- function(n, wyg, j0, j1, alpha, verbose, K, s)
{
	epsbound <- 1e-7;
	wn <- n+1;


	# prepare input data for .pareto2.htest.getsize_optimized
	# this significanlty speeds up computations
	xhir0n <- (0:n);
	dhirp2mat <- matrix(nrow=length(K), ncol=length(xhir0n));
	for (i in 1:length(K))
		dhirp2mat[i,] <- pareto2.dhirsch(xhir0n,n,K[i],s);
	# -------------------------------------------------------



	for (i in j0:j1)
	{
		lbound <- ifelse(i>1 && i < wn, min(wyg[i-1], wyg[i+1]), 0)
		if (wyg[i] > lbound)
		{
			repeat
			{
				wyg[i] <- wyg[i]-1L;
				POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
				if (max(POW)>alpha)
				{
					wyg[i] <- wyg[i]+1L;
					break;
				}
				if (wyg[i] == lbound) break;
			}

			if (verbose)
			{
				POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
				cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", abs((i-j0)/(j1-j0))*100, mean(POW)));
			}
		}
	}

	if (verbose)
	{
		POW <- .pareto2.htest.getsize_optimized(n, xhir0n, wyg, K, s, dhirp2mat);
		cat(sprintf("%3.0f%% complete in current iteration, q=%.4f.\r", 100, mean(POW)));
	}

	return(wyg);
}


#' /internal/
.pareto2.htest.getK <- function(n, drho, s)
{
	# control function:
	kappa    <- function(x) { pmax(0,pmin(1,x))*n; }

	# kappa-indices (rho) for which to determine the power function
	RHO <- c(1e-3,seq(1e-3+drho, 1-drho-1e-3, drho), 1-1e-3);
	nrho <- length(RHO);

	stopifnot(RHO[1]<RHO[length(RHO)] && RHO[1]<RHO[2]);

	K   <- numeric(nrho);
	for (i in 1:nrho)
	{
		K[i] <- uniroot(function(k,s,targetrho,kappa)
			{
				1-ppareto2(kappa(targetrho),k,s)-targetrho;
			}, c(1e-15,ifelse(i==1,1e15,K[i-1])), # note that we assume RHO is sorted increasingly-that's much faster
			   s, RHO[i], kappa, tol=1e-20)$root;
	}

	return(K);
}



#' Performs \eqn{h}-test for equality of shape parameters
#' of two samples from the Pareto type-II distributions with known
#' and equal scale parameters, \eqn{s>0}.
#'
#' Given two equal-sized samples \eqn{X=(X_1,...,X_n)} i.i.d. \eqn{P2(k_x,s)}
#' and \eqn{Y=(Y_1,...,Y_m)} i.i.d. \eqn{P2(k_y,s)}
#' this test verifies the null hypothesis
#' \eqn{H_0: k_x=k_y}
#' against two-sided or one-sided alternatives, depending
#' on the value of \code{alternative}.
#' It bases on test statistic
#' \code{T=H(Y)-H(X)}
#' where \eqn{H} denotes Hirsch's \eqn{h}-index (see \code{\link{index.h}}).
#'
#' Note that for \eqn{k_x < k_y}, then \eqn{X} dominates \eqn{Y} stochastically.
#'
#' @title Two-sample h-test for equality of shape parameters for Type II-Pareto distributions with known common scale parameter
#' @param x an n-element non-negative numeric vector of data values.
#' @param y an n-element non-negative numeric vector of data values.
#' @param s scale parameter, \eqn{s>0}.
#' @param alternative indicates the alternative hypothesis and must be one of "two.sided" (default), "less", or "greater".
#' @param significance significance level. See Value for details.
#' @param wyg precomputed h-dependent acceptation region or \code{NULL}. See Value for details.
#' @param verbose logical; if \code{TRUE} then the computation progress will be printed out.
#' @param drho power calculation accuracy, a single number in [0.001, 0.1]. The smaller the value the slower computation, but more precise. This is used to determine \code{K} iff \code{K} is not given.
#' @param K numeric vector; shape parameters for which to calculate the power function or \code{NULL}.
#' @param improve logical; if \code{TRUE} then the greedy heuristic algorithm for improving the acceptation region will be run.
#' @return
#' The list of class \code{power.htest} with the following components is passed as a result:
#' \tabular{ll}{
#' \code{statistic} \tab	the value of the test statistic.\cr
#' \code{result} \tab	either FALSE (accept null hypothesis) or TRUE (reject).\cr
#' \code{alternative} \tab	a character string describing the alternative hypothesis.\cr
#' \code{method} \tab	a character string indicating what type of test was performed.\cr
#' \code{data.name} \tab	a character string giving the name(s) of the data.\cr
#' \code{wyg} \tab	a numeric vector giving the h-dependent acceptation region used.\cr
#' \code{size} \tab	size of the test corresponding to \code{wyg}.\cr
#' \code{qual} \tab	quality of the test corresponding to \code{wyg}, the closer to \code{significance}, the better.\cr
#' }
#' Currently no method for determining the p-value of this test is implemented.
#' @export
#' @seealso \code{\link{dpareto2}}, \code{\link{pareto2.goftest}}, \code{\link{pareto2.ftest}}, \code{\link{pareto2.htest.approx}}, \code{\link{index.h}}
#' @references
#' Gagolewski M., Grzegorzewski P., S-Statistics and Their Basic Properties, In: Borgelt C. et al (Eds.),
#' Combining Soft Computing and Statistical Methods in Data Analysis, Springer-Verlag, 2010, 281-288.\cr
pareto2.htest <- function(x, y, s, alternative = c("two.sided", "less", "greater"), significance=0.05, wyg=NULL, verbose=TRUE, drho=0.005, K=NULL, improve=TRUE)
{
	if (length(significance) != 1 || significance <= 0 || significance >= 1) stop("incorrect significance level");

	if (significance > 0.2) warning("'significance' is possibly incorrect");

	alternative <- match.arg(alternative);
	DNAME <- deparse(substitute(x));
	DNAME <- paste(DNAME, "and", deparse(substitute(y)));

	if (mode(s) != "numeric" || length(s) != 1 || s <= 0) stop("'s' should be > 0");

	if (length(drho) != 1 || drho < 0.001 || drho > 0.1)
		stop("drho should be a single numeric value in [0.000001, 0.1]");

	if (!is.null(K) && (mode(K) != "numeric" || length(K) < 10 || any(K<=0) || any(is.infinite(K))))
		stop("incorrect 'K'");

	if (mode(x) != "numeric" || mode(y) != "numeric") stop("non-numeric data given");

	x <- x[!is.na(x)];
	n <- length(x);
	if (n < 1L || any(x<0)) stop("incorrect 'x' data");

	y <- y[!is.na(y)];
	if (length(y) < 1L || any(y<0)) stop("incorrect 'y' data");

	if (length(y) != n) stop("non-equal-sized vectors given on input");

# 	if (n > 50 && is.null(wyg)) warning("n is large - Do you know what you're doing? It's damn slow! :-)");


	HY <- index.h(y,disable.check=TRUE);
	HX <- index.h(x,disable.check=TRUE);
	STATISTIC <- HY-HX;
	names(STATISTIC) <- "H";

	METHOD <- "Two-sample h-test for equality of shape parameters for Type II-Pareto distributions with known common scale parameter";

	nm_alternative <- switch(alternative, two.sided = "two-sided",
			less = "kx < ky",
			greater = "kx > ky");

	if (alternative == "two.sided") {
		powerscale <- 2;
	} else {
		powerscale <- 1;
	}
	alpha <- significance/powerscale;
	size <- NA;
	qual <- NA;

	wn <- n+1;     # number of possible h-index values

	# -----------------------------------------------------------------------

	if (is.null(wyg))
	{
		if (is.null(K))
		{
			# find scale parameters corresponding to kappa-indices
			if (verbose) cat(sprintf("Determining 'K' for 'drho'=%g...\n", drho));
			K <- .pareto2.htest.getK(n, drho, s)
		}


		# determine bound of the acceptation region that is independent on the value
		# of the h-indices (constant 'wyg')
		if (verbose) cat(sprintf("Determining h-independent bound of the acceptation region...\n"));

		v <- 0L; # initial solution
		wyg <- rep(v,wn) # a constant function of observed h-index
		POW <- .pareto2.htest.getpowerupper(n, wyg, K, K, s);

		while (max(POW) > alpha)
		{
			v <- v+1L;
			if (v > n) stop("h-independent bound could not be found");
			wyg <- rep(v,wn) # a constant function of observed h-index
			POW <- .pareto2.htest.getpowerupper(n, wyg, K, K, s);
		}

		if (verbose)
		{
			size <- max(POW)*powerscale;
			qual <- mean(POW)*powerscale;
			cat(sprintf("OK, v=%d for n=%d. This gives test size=%f and qual=%f.\n",
				v, n, size, qual));
		}
	} else { # wyg is not null

		if (length(wyg) != n+1 || mode(wyg) != "numeric" || any(wyg<0) || any(wyg>n))
			stop("incorrect 'wyg' for this test case");

		if (verbose || improve) # check whether given 'wyg' guarantees desired significance level
		{
			if (is.null(K))
			{
				# find scale parameters corresponding to kappa-indices
				if (verbose) cat(sprintf("Determining 'K' for 'drho'=%g...\n", drho));
				K <- .pareto2.htest.getK(n, drho, s);
			}

			POW <- .pareto2.htest.getpowerupper(n, wyg, K, K, s);
			if (verbose)
			{
				size <- max(POW)*powerscale;
				qual <- mean(POW)*powerscale;
				cat(sprintf("Given test size=%f and qual=%f for n=%d.\n",
					size, qual, n));
			}

			if (max(POW) > alpha) stop("Given 'wyg' does not guarantee desired significance level");
		}
	}


	# improve the acceptation region
	if (improve)
	{
		if (verbose) cat(sprintf("Improving h-dependent bounds of the acceptation region...\n"));

# 				wyg <- .htest.acceptregimprove(n, ppareto2, wyg, 1, wn,  alpha, verbose, K, s);
# 				wyg <- .htest.acceptregimprove(n, ppareto2, wyg, wn, 1,  alpha, verbose, K, s);
# 		wyg <- .pareto2.htest.acceptreg.improve(n, wyg, wn, 1, alpha, verbose, K, s)
# 		wyg <- .pareto2.htest.acceptreg.improve(n, wyg, 1, wn, alpha, verbose, K, s)
		wyg <- .pareto2.htest.acceptreg.improve2(n, wyg, alpha, verbose, K, s);

		if (verbose)
		{
			POW <- .pareto2.htest.getpowerupper(n, wyg, K, K, s);
			size <- max(POW)*powerscale;
			qual <- mean(POW)*powerscale;
			cat(sprintf("This now gives test size=%f and qual=%f.\n",
				size, qual));
			stopifnot(max(POW) <= alpha);
		}
	}


	# -----------------------------------------------------------------------

	if (alternative == "two.sided") {
		if (HX<HY)
		{
			RESULT <- (abs(STATISTIC)>wyg[HX+1]);
		} else {
			RESULT <- (abs(STATISTIC)>wyg[HY+1]);
		}
	} else if (alternative == "less") {
		if (HX<HY)
		{
			RESULT <- FALSE;
		} else {
			RESULT <- (abs(STATISTIC)>wyg[HY+1]);
		}
	} else {
		if (HX>HY)
		{
			RESULT <- FALSE;
		} else {
			RESULT <- (abs(STATISTIC)>wyg[HX+1]);
		}
	}

	RVAL <- list(statistic = STATISTIC, result = RESULT, alternative = nm_alternative,
		method = METHOD, data.name = DNAME, wyg = wyg, size = size, qual = qual);
	class(RVAL) <- "power.htest";
	return(RVAL);
}