Nothing
#' Power estimation for two-sample methods
#'
#' Find the power of various two sample tests using Rcpp and parallel computing.
#'
#' For details consult vignette("R2sample","R2sample")
#'
#' This routine runs a number of different two-sample tests for univariate data,
#' either discrete or continuous. The user can also provide their own test method.
#'
#' @param f function to generate a list with data sets x, y and (optional) vals, weights
#' @param ... additional arguments passed to f, up to 2
#' @param TS routine to calculate test statistics for non-chi-square tests
#' @param TSextra additional info passed to TS, if necessary
#' @param With.p.value =FALSE does user supplied routine return p values?
#' @param alpha =0.05, the level of the hypothesis test
#' @param B =1000, number of simulation runs.
#' @param nbins =c(50,10), number of bins for chi large and chi small.
#' @param minexpcount =5 minimum required count for chi square tests
#' @param UseLargeSample should p values be found via large sample theory if n,m>10000?
#' @param samplingmethod ="Binomial" or independence in discrete data case
#' @param rnull a function that generates data from a model, possibly with parameter estimation.
#' @param SuppressMessages = FALSE print informative messages?
#' @param maxProcessor maximum number of cores to use. If maxProcessor=1 no parallel computing is used.
#' @return A numeric vector of power values.
#' @export
#' @examples
#' # Power of standard normal vs. normal with mean mu.
#' f1=function(mu) list(x=rnorm(25), y=rnorm(25, mu))
#' #Power of uniform discrete distribution vs. with different probabilities.
#' twosample_power(f1, mu=c(0,2), B=100, maxProcessor = 1)
#' f2=function(n, p) list(x=table(sample(1:5, size=1000, replace=TRUE)),
#' y=table(sample(1:5, size=n, replace=TRUE,
#' prob=c(1, 1, 1, 1, p))), vals=1:5)
#' twosample_power(f2, n=c(1000, 2000), p=c(1, 1.5), B=100, maxProcessor = 1)
#' # Compare power of a new test with those in package:
#' myTS=function(x,y) {z=c(mean(x)-mean(y),sd(x)-sd(y));names(z)=c("M","S");z}
#' cbind(twosample_power(f1, mu=c(0,2), TS=myTS,B=100, maxProcessor = 1),
#' twosample_power(f1, mu=c(0,2), B=100, maxProcessor = 1))
#' # Power estimation if routine returns a p value
#' myTS2=function(x, y) {out=ks.test(x,y)$p.value; names(out)="KSp"; out}
#' twosample_power(f1, c(0,1), TS=myTS2, With.p.value = TRUE, B=100)
#'
twosample_power=function(f, ..., TS, TSextra, With.p.value=FALSE,
alpha=0.05, B=1000,
nbins=c(50,10), minexpcount=5, UseLargeSample,
samplingmethod="Binomial", rnull,
SuppressMessages=FALSE, maxProcessor) {
if(!missing(UseLargeSample) & !missing(rnull)) {
message("Large sample methods are not implemented for GOF-Twosample hybrid problem")
return(NULL)
}
# create function rxy which generates data, with two arguments
if(length(list(...))==0) { # f has 0 arguments
rxy=function(a=0, b=0) f()
avals=0
bvals=0
}
if(length(list(...))==1) { # f has 1 argument
rxy=function(a=0, b=0) f(a)
avals=list(...)[[1]]
bvals=0
}
if(length(list(...))==2) { # f has 2 arguments
rxy=function(a=0, b=0) f(a,b)
avals=list(...)[[1]]
bvals=list(...)[[2]]
}
# check that avals and bvals have the same length.
# If they do, matrix of powers is returned without row names.
# If one of them is a scalar, make it the same length as the other and use those
# values as row names
if(length(avals)!=length(bvals)) {
if(min(c(length(avals),length(bvals)))>1) {
message("lengths of parameter vectors not compatible!\n")
return(NULL)
}
if(length(avals)==1) {
avals=rep(avals, length(bvals))
rnames=bvals
}
else {
bvals=rep(bvals, length(avals))
rnames=avals
}
}
else rnames=1:length(avals)
# generate one data set as an example, do some setup
dta = rxy(avals[1], bvals[1])
if(length(dta)<2 | length(dta)>4) {
message("routine f has to create a list with 2 (continous data without weights), 3 (discrete data) or 4 (continous data with weights)")
return(NULL)
}
if( (!("x"%in%names(dta))) | (!("x"%in%names(dta))) ) {
message("routine f has to create a list with elements named x and y, vals (optional, for discrete data) , wx and wy (optional, for continuous weighted data")
return(NULL)
}
x=dta$x
y=dta$y
if("vals"%in%names(dta)) vals=dta$vals
if("wx"%in%names(dta)) wx=dta$wx
else wx=rep(1, length(x))
if("wy"%in%names(dta)) wy=dta$wy
else wy=rep(1, length(y))
if(!is.numeric(samplingmethod))
samplingmethod=ifelse(samplingmethod=="independence", 1, 2)
Continuous = ifelse("vals" %in% names(dta), FALSE, TRUE)
if(missing(TSextra)) TSextra = list(samplingmethod=samplingmethod)
else TSextra=c(TSextra, samplingmethod=samplingmethod)
if(!missing(rnull)) TSextra=c(TSextra, rnull=rnull)
CustomTS=ifelse(missing(TS), FALSE, TRUE)
WithWeights=ifelse(all(c(wx,wy)==1), FALSE, TRUE)
if(!Continuous & missing(maxProcessor)) {
maxProcessor=1
if(!SuppressMessages) message("For discrete data only a single processor is used if maxProcessor is not specified")
}
if(min(wx, wy)<0.01 | max(wx, wy)>5)
if(!SuppressMessages) message("Some of the weights are either exceptionally small or large. In either case the tests can be unreliable!")
if(missing(UseLargeSample)) {
UseLargeSample=ifelse(max(length(x), length(y))<1e4, FALSE, TRUE)
if(!Continuous) UseLargeSample=FALSE
}
if(!Continuous) vals=dta$vals
if(!CustomTS) { # do included methods
if(Continuous) {
if(!WithWeights) { # No weights
typeTS = 1
TS=TS_cont
tmp=TS(x,y)
}
else { # weighted data
typeTS = 2
TS=TSw_cont
tmp=TS(x, y, wx, wy)
doMethods=c("KS", "Kuiper", "CvM", "AD")
if(min(wx,wy)<0.01 | max(wx,wy)>5)
if(!SuppressMessages) message("Some of the weights are either exceptionally small or large. In either case the tests can be unreliable!")
}
}
else {
typeTS=5
TS=TS_disc
TSextra=c(TSextra, list(adw=weights(dta)))
tmp=TS(x, y, vals, TSextra$adw)
}
}
else { # do user-supplied tests
if(substr(deparse(TS)[2], 1, 5)==".Call") {
if(maxProcessor>1) {
if(!SuppressMessages) message("Parallel Programming is not possible if custom TS is written in C++. Switching to single processor")
maxProcessor=1
}
}
if(Continuous) {
if(length(formals(TS))==2) {
typeTS=3
tmp=TS(x, y)
}
else {
typeTS=4
tmp=TS(x, y, TSextra)
}
}
else {
if(length(formals(TS))==3) {
typeTS=6
tmp=TS(x, y, vals)
}
else {
typeTS=7
tmp=TS(x, y, vals, TSextra)
}
}
if(is.list(tmp)) {
message("TS should return a named vector of either test statistic(s) or p value(s), but not both.")
return(NULL)
}
if(is.null(names(tmp))) {
message("output of TS routine has to be a named vector!")
return(NULL)
}
}
methodnames=names(tmp)
# do chi square tests if built-in tests are used
pwrchi=NULL
if( (typeTS %in% c(1,2,5)) & (!UseLargeSample) ) {
pwrchi = chi_power(rxy, alpha, B[1], avals, bvals,
nbins, minexpcount, typeTS)
}
# Run tests for power
if(missing(maxProcessor) & Continuous) {
maxProcessor=max(1, parallel::detectCores(logical = FALSE)-1)
}
if(UseLargeSample) {
if(!SuppressMessages) message("Using methods with large sample theories")
if(maxProcessor==1) { # no parallel processing
pwr=power_cont_LS(rxy, alpha, B[1], avals, bvals)
}
else {
cl <- parallel::makeCluster(maxProcessor)
z=parallel::clusterCall(cl, power_cont_LS, rxy=rxy,
alpha=alpha, B=round(B/maxProcessor) , xparam=avals, yparam=bvals)
parallel::stopCluster(cl)
# Average power over cores
pwr=z[[1]]
for(i in 2:maxProcessor) pwr=pwr+z[[i]]
pwr = pwr/maxProcessor
}
}
else {
if(With.p.value) {
pwr=power_newtest(TS, f, avals, TSextra, alpha, B)
}
else {
pwr=powerR(rxy=rxy, xparam=avals, yparam=bvals, TS=TS, typeTS,
TSextra, alpha=alpha, B=B,
SuppressMessages = SuppressMessages, maxProcessor=maxProcessor)
}
}
if(UseLargeSample) {
colnames(pwr) = c("KS", "Kuiper", "CvM", "AD",
"ES large", "ES small" , "EP large", "EP small")
}
else colnames(pwr) = methodnames
rownames(pwr) = rnames
if(!CustomTS) pwr = cbind(pwr, pwrchi)
if(nrow(pwr)==1) {
nm=colnames(pwr)
pwr=c(pwr)
names(pwr)=nm
}
round(pwr, 3)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.