Nothing
#' Tests for the univariate two-sample problem
#'
#' This function runs a number of two sample tests using Rcpp and parallel computing.
#'
#' For details consult vignette("R2sample","R2sample")
#'
#' @param x a vector of numbers if data is continuous or of counts if data is discrete or a list with the data
#' @param y a vector of numbers if data is continuous or of counts if data is discrete.
#' @param vals =NA, a vector of numbers, the values of a discrete random variable. NA if data is continuous data.
#' @param TS routine to calculate test statistics for non-chi-square tests
#' @param TSextra additional info passed to TS, if necessary
#' @param wx A numeric vector of weights of x.
#' @param wy A numeric vector of weights of y.
#' @param B =5000, number of simulation runs for permutation test
#' @param nbins =c(50,10), number of bins for chi square tests.
#' @param minexpcount =5, minimum required expected counts for chi-square tests.
#' @param maxProcessor maximum number of cores to use. If missing (the default) no parallel processing is used.
#' @param UseLargeSample should p values be found via large sample theory if n,m>10000?
#' @param samplingmethod ="Binomial" or "independence" for discrete data
#' @param rnull a function that generates data from a model, possibly with parameter estimation.
#' @param doMethods ="all" a vector of codes for the methods to include. If "all", all methods are used.
#' @param SuppressMessages = FALSE print informative messages?
#' @return A list of two numeric vectors, the test statistics and the p values.
#' @export
#' @examples
#' R2sample::twosample_test(rnorm(1000), rt(1000, 4), B=1000)
#' myTS=function(x,y) {z=c(mean(x)-mean(y),sd(x)-sd(y));names(z)=c("M","S");z}
#' R2sample::twosample_test(rnorm(1000), rt(1000, 4), TS=myTS, B=1000)
#' vals=1:5
#' x=table(sample(vals, size=100, replace=TRUE))
#' y=table(sample(vals, size=100, replace=TRUE, prob=c(1,1,3,1,1)))
#' R2sample::twosample_test(x, y, vals)
twosample_test=function(x, y, vals=NA, TS, TSextra, wx=rep(1, length(x)),
wy=rep(1, length(y)), B=5000, nbins=c(50,10),
minexpcount=5, maxProcessor, UseLargeSample,
samplingmethod="Binomial", rnull,
SuppressMessages = FALSE, doMethods="all") {
default.methods = list(cont=c("EP small", "ZA", "ZK", "Wassp1"),
disc=c("small", "ZA", "Kuiper", "Wassp1"))
if(is.list(x)) {
y=x$y
if("vals"%in%names(x)) vals=x$vals
if("wx"%in%names(x)) wx=x$wx
if("wy"%in%names(x)) wy=x$wy
x=x$x
}
if(missing(maxProcessor)) {
if(min(c(length(x), length(y)))<1e5) {
maxProcessor=1
}
else {
maxProcessor=max(1, parallel::detectCores(logical = FALSE)-1)
if(!SuppressMessages) message(paste("Using",maxProcessor," cores"))
}
}
if(!is.numeric(samplingmethod))
samplingmethod=ifelse(samplingmethod=="independence", 1, 2)
if(missing(TSextra)) TSextra = list(samplingmethod=samplingmethod)
else TSextra=c(TSextra, samplingmethod=samplingmethod)
if(!missing(rnull)) TSextra=c(TSextra, rnull=rnull)
Continuous=ifelse(any(is.na(vals)), TRUE, FALSE)
CustomTS=ifelse(missing(TS), FALSE, TRUE)
WithWeights=ifelse(all(c(wx,wy)==1), FALSE, TRUE)
if(missing(UseLargeSample))
UseLargeSample=ifelse(min(length(x), length(y))<1e4, FALSE, TRUE)
outchi = list(statistics=NULL, p.values=NULL)
if(test_methods(doMethods,Continuous, UseLargeSample, WithWeights))
return(NULL)
# what methods are to be run?
if(!CustomTS) { # do included methods
if(Continuous) {
if(!WithWeights) { # No weights
typeTS = 1
TS=TS_cont
dta=list(x=x, y=y)
tmp=TS(x,y)
outchi = chi_test(dta, nbins, minexpcount, typeTS=1)
}
else { # weighted data
typeTS = 2
TS=TSw_cont
dta=list(x=x, y=y, wx=wx, wy=wy)
tmp=TS(x, y, wx, wy)
outchi = chi_test(dta, nbins, minexpcount, typeTS=2)
doMethods=c("KS", "Kuiper", "CvM", "AD", names(outchi[[1]]))
if(min(wx,wy)<0.01 | max(wx,wy)>5)
if(!SuppressMessages) message("Some of the weights are either exceptionally small or large. In either case the tests can be unreliable!")
}
}
else {
typeTS=5
TS=TS_disc
dta=list(x=x, y=y, vals=vals)
TSextra=c(TSextra, list(adw=weights(dta)))
tmp=TS(x, y, vals, TSextra$adw)
outchi = chi_test(dta, nbins, minexpcount, typeTS=5)
}
chimethods = names(outchi[[1]])
}
else { # do user-supplied tests
if(substr(deparse(TS)[2], 1, 5)==".Call") {
if(maxProcessor>1) {
if(!SuppressMessages) message("Parallel Programming is not possible if custom TS is written in C++. Switching to single processor")
maxProcessor=1
}
}
if(Continuous) {
if(length(formals(TS))==2) {
typeTS=3
dta=list(x=x, y=y)
tmp=TS(x, y)
}
else {
typeTS=4
dta=list(x=x, y=y)
tmp=TS(x, y, TSextra)
}
}
else {
if(length(formals(TS))==3) {
typeTS=6
dta=list(x=x, y=y, vals=vals)
tmp=TS(x, y, vals)
}
else {
typeTS=7
dta=list(x=x, y=y, vals=vals)
tmp=TS(x, y, vals, TSextra)
}
}
if(is.null(names(tmp))) {
message("output of TS routine has to be a named vector!")
return(NULL)
}
}
# set number of processors for parallel programming
m=parallel::detectCores(logical = FALSE)
if(maxProcessor>1) maxProcessor=min(maxProcessor, m-1)
# Check whether continuous data has many ties
if(Continuous) {
if(length(unique(c(x,y)))*3<length(c(x,y)))
if(!SuppressMessages) message("There appear to be many ties. If data is actually discrete, run discrete
version of twosample_test. For details see Help\n")
}
# if data is discrete, check that all vectors have equal lengths, that there are no empty classes
# and ZK method is not run
if(!Continuous) {
if(length(x)!=length(vals) | length(y)!=length(vals)) {
message("x, y and vals have to have equal lengths. (x or y can be 0)!\n")
return(NULL)
}
if(min(x+y)==0) {
message("Bins with 0 counts of both x and y are not allowed!\n")
return(NULL)
}
}
# if data is continuous and min(n,m)>10000, use asymptotic formulas for p value
if(UseLargeSample) {
if(!missing(rnull)) {
message("Large sample methods are not implemented for GOF-Twosample hybrid problem")
return(NULL)
}
statistics = tmp[c("KS", "Kuiper", "CvM", "AD")]
p.values=asymptotic_pvalues(statistics, length(x), length(y))
s = c(statistics, outchi$statistics)
p = c(p.values, outchi$p.values)
return(list(statistics=round(s,4), p.values=round(p,4)))
}
# if either only one core is present, B=0 or maxProcessor=1, run testC.
if(B==0 | maxProcessor==1) {
outTS = testC(dta, TS, typeTS, TSextra,B=B)
}
else {
# run testC in parallel. Use one less core than is present, at most maxProcessor.
cl=parallel::makeCluster(maxProcessor)
z=parallel::clusterCall(cl, testC, dta=dta,
TS=TS, typeTS=typeTS, TSextra=TSextra, B= round(B/m))
parallel::stopCluster(cl)
# average p values over cores
p=z[[1]]$p.values
for(i in 2:m) p=p+z[[i]]$p.values
p = round(p/m, 4)
outTS = list(statistics=z[[1]]$statistics, p.values=p)
}
if(CustomTS) return(signif.digits(outTS))
s = c(outTS$statistics, outchi$statistics)
p = c(outTS$p.values, outchi$p.values)
out = list(statistics=s, p.values=p)
out=signif.digits(out)
if(doMethods[1]=="all") return(out)
if(typeTS%in%c(2,6)) return(out)
if(doMethods[1]=="default") {
if(Continuous) doMethods = default.methods$cont
else doMethods = default.methods$disc
}
out = list(statistics=s[doMethods], p.values=p[doMethods])
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.