## ------------------------------------------------------------------------
library(lattice)
## ------------------------------------------------------------------------
## Kernel density estimates vs. a fitted normal density for growing sample
## sizes n = 5, 10, ..., 45, one lattice panel per sample size.
n=seq(5,45,5)
x=rnorm(sum(n))
## panel labels: the first 5 draws belong to "n= 5", the next 10 to "n= 10", ...
y=factor(rep(n,n),labels = paste('n=',n))
densityplot(~x|y,
panel = function(x, ...) {
## kernel density estimate of the data shown in this panel
panel.densityplot(x, col="DarkOliveGreen", ...)
## overlay the normal density using the panel's sample mean and sd
panel.mathdensity(dmath=dnorm,
args=list(mean=mean(x), sd=sd(x)),
col="darkblue")
})
## ------------------------------------------------------------------------
data(quakes)
## Bin quake depth into 9 equal-width classes and plot the epicentre
## locations (lat vs. long) conditioned on the depth class.
mini = min(quakes$depth)
maxi = max(quakes$depth)
int = ceiling((maxi - mini)/9) ## class width
inf = seq(mini, maxi, int) ## lower break points of the classes
## depth class of each quake, labelled "lower-upper"
quakes$depth.cat = factor(floor(((quakes$depth - mini) / int)),
labels=paste(inf, inf + int, sep="-"))
xyplot(lat ~ long | depth.cat, data = quakes)
## ------------------------------------------------------------------------
data(InsectSprays)
## One-way ANOVA of the square-root-transformed insect counts by spray type.
aov.spray = aov(sqrt(count) ~ spray, data = InsectSprays)
aov.spray
## ------------------------------------------------------------------------
## Term plot with standard errors, partial residuals and a data rug.
termplot(aov.spray, se=TRUE, partial.resid=TRUE, rug=TRUE)
## ------------------------------------------------------------------------
## list the packages/environments currently on the search path
search()
## ------------------------------------------------------------------------
library(grid)
## ------------------------------------------------------------------------
## open the help index of the grid package
library(help = grid)
## ------------------------------------------------------------------------
## Discrete inverse-transform sampling from P(X = x[k]) = p[k] for
## x = 0,...,4, followed by a comparison of empirical and theoretical
## frequencies.
set.seed(977) ## set a random seed
x=c(0:4) ## the support
p=c(0.1,0.2,0.2,0.2,0.3) ## the cell probabilities
cp=cumsum(p) ## the cumulative probabilities
n=1000 ## sample size
## inverse transform: findInterval maps u in (cp[k-1], cp[k]] to x[k]
## (removed a redundant r=numeric(n) preallocation that was immediately
## overwritten)
r=x[findInterval(runif(n),cp)+1]
table(r) ## frequency table of the sample
ct=as.vector(table(r))
ct/sum(ct) ## the empirical probabilities
ct/sum(ct)/p ## ratio of empirical to theoretical probabilities
## ----fig.height=4--------------------------------------------------------
Y=p*n ## expected counts under the true distribution
counts = rbind(table(r),Y) ## observed and expected counts (a 2 x 5 matrix)
rownames(counts)=c('r','Y') ## row names of the matrix
counts
## (fixed: divide by the sample size n instead of the hard-coded 1000)
barplot(counts/n, main="multinomial distribution generation",
xlab="x",ylab='prob', col=c("darkblue","red"),
beside=TRUE,legend.text = c('empirical','theoretical'),
args.legend = list(x = "topleft"))
## ------------------------------------------------------------------------
## Acceptance-rejection sampling from the Beta-type density
## f(x) = 12 x^2 (1-x) on (0,1), with proposal g = U(0,1).
set.seed(53) ## fix the RNG state
n=1000
accepted=0 ## number of accepted draws so far
j=0 ## total candidates proposed
y=numeric(n)
repeat {
u=runif(1) ## uniform for the accept/reject comparison
j=j+1
cand=runif(1) ## candidate drawn from the proposal g
## accept when u < cand^2 (1 - cand), i.e. u < f(cand)/(c g(cand))
if (cand^2*(1-cand)>u) {
accepted=accepted+1
y[accepted]=cand
}
if (accepted==n) break
}
j
## histogram of the accepted sample with the target density overlaid
hist(y,breaks=15,freq=FALSE, main='empirical vs theoretical')
xs=seq(0,1,0.02)
lines(xs,12*xs^2*(1-xs)) ## the theoretical curve 12 x^2 (1-x)
## ------------------------------------------------------------------------
# generate an Exponential-Gamma mixture
set.seed(473)
n=1000
r=4
beta=2
lambda = rgamma(n, r, beta) ## draw lambda from the Gamma(4,2) distribution
# supply the sampled lambdas as the rate parameter of the Exponential
y_lambda = rexp(n, lambda) ## generate the mixture
# (fixed: removed a stray empty argument between ylim and xlim)
hist(y_lambda,col='gray',ylim=c(0,800),xlim=c(0,12),main='Exponential-Gamma mixture')
## ------------------------------------------------------------------------
set.seed(432)
m=10000
u=runif(m) # uniforms reused by CDF() for every x (common random numbers)
## Monte Carlo estimate of the Beta(3,3) cdf at x:
## F(x) = int_0^x 30 t^2 (1-t)^2 dt, estimated with the substitution t = x*u.
CDF=function(x){
theta_hat=mean(x^3*u^2*(1-x*u)^2)
# explicit return (the original relied on the value of an assignment,
# which is returned invisibly)
return(30*theta_hat)
}
q=seq(0.1,0.9,0.1)
Phi=pbeta(q,3,3) ## exact cdf of Beta(3,3) at q via 'pbeta'
## Monte Carlo cdf estimates at each q (vapply replaces the index loop)
cdf=vapply(q, CDF, numeric(1))
## Compare the estimates with the exact values
print(round(rbind(q,cdf,Phi),3))
## ------------------------------------------------------------------------
## Monte Carlo estimator of the Rayleigh(sigma) mean via the inverse
## transform X = sigma*sqrt(-2*log(1-U)), optionally using antithetic
## variables (U paired with 1-U) to reduce variance.
## x: vector of sigma values; R: number of replicates per estimate.
mc=function(x,R=10000,antithetic = TRUE){
u=runif(R/2)
if (!antithetic) v=runif(R/2) else
v=1-u
u=c(u,v)
me=numeric(length(x))
for (i in seq_along(x)){
g=x[i]*(sqrt(2*log(1/(1-u)))) # Rayleigh draws with scale x[i]
me[i]=mean(g)
}
me
}
sigma=c(1,2,3,4,5) ## the sigma values to estimate at
## (fixed: spell out `antithetic` instead of relying on partial matching;
## the non-antithetic run is plain Monte Carlo, not an "inverse method")
mc1=mc(sigma) ## antithetic variables method
mc2=mc(sigma,antithetic=FALSE) ## plain (independent) Monte Carlo
print(rbind(sigma,mc1,mc2))
### compare the percent reduction in variance
m=1000 ## repeat 1000 times
mc1=mc2=numeric(m)
sigma=1 ## let sigma=1
for(i in 1:m){
mc1[i]=mc(sigma,R=1000)
mc2[i]=mc(sigma,R=1000,antithetic=FALSE)
}
print(sd(mc1)) ## sd of the antithetic-variables estimator
print(sd(mc2)) ## sd of the plain Monte Carlo estimator
print((var(mc2)-var(mc1))/var(mc2)) ## the percent reduction in variance
## ------------------------------------------------------------------------
set.seed(345)
m=10000
u=runif(m)
## inverse-transform draws from the importance density
## f(t) = t*exp(-t^2/2)/(1-exp(-1/2)) on (0,1)
x=sqrt(-2*log(1-(1-exp(-0.5))*u)) ## inverse method generates a sample from the importance function
## NOTE(review): theta_hat = 1/2 - (1-e^{-1/2})/sqrt(2*pi) * mean(x);
## presumably an importance-sampling estimate built from the identity for
## the integrand of interest -- confirm against the accompanying exercise text.
theta_hat=1/2-(1-exp(-1/2))/sqrt(2*pi)*mean(x)
theta_hat
## ------------------------------------------------------------------------
## Sampling distributions of the mean, median and Gini ratio for
## lognormal(0,1) samples: 1000 replications (columns) of size 2000 (rows).
set.seed(436)
data=matrix(rlnorm(1000*2000,mean=0,sd=1),nrow=2000,ncol=1000)
logn_mean=rep(0,ncol(data)) ## sample mean, one per replication
logn_median=rep(0,ncol(data)) ## sample median, one per replication
logn_G=rep(0,ncol(data)) ## Gini ratio estimate, one per replication
n=nrow(data) ## size of each sample
## repeat 1000 times
for (i in 1:ncol(data)){
logn_mean[i]=mean(data[,i]) ## the sample mean
data[,i]=sort(data[,i],decreasing=FALSE) ## order statistics (ascending)
## median of an even-sized sample: average of the two middle order
## statistics. (fixed: the original indexed with ncol(data)/2, i.e.
## positions 500/501 of a sample of size 2000, which is not the median)
logn_median[i]=(data[,i][n/2]+data[,i][n/2+1])/2
## Gini ratio via order statistics: G = sum_j (2j-n-1) x_(j) / (n^2 xbar)
## (fixed: the original divided mean(d), not sum(d), by n^2*xbar, which
## underestimates G by a factor of n)
d=(2*seq_len(n)-n-1)*data[,i]
logn_G[i]=sum(d)/(n^2*logn_mean[i])
}
hist(logn_G)
hist(logn_mean)
hist(logn_median)
## ------------------------------------------------------------------------
## Same Monte Carlo study as above, now for uniform(0,1) samples:
## 1000 replications (columns) of size 2000 (rows).
set.seed(1436)
data=matrix(runif(1000*2000),nrow=2000,ncol=1000)
unif_mean=rep(0,ncol(data)) ## sample mean, one per replication
unif_median=rep(0,ncol(data)) ## sample median, one per replication
unif_G=rep(0,ncol(data)) ## Gini ratio estimate, one per replication
n=nrow(data) ## size of each sample
## repeat 1000 times
for (i in 1:ncol(data)){
unif_mean[i]=mean(data[,i]) ## the sample mean
data[,i]=sort(data[,i],decreasing=FALSE) ## order statistics (ascending)
## median of an even-sized sample: average of the two middle order
## statistics. (fixed: the original indexed with ncol(data)/2, i.e.
## positions 500/501 of a sample of size 2000, which is not the median)
unif_median[i]=(data[,i][n/2]+data[,i][n/2+1])/2
## Gini ratio via order statistics: G = sum_j (2j-n-1) x_(j) / (n^2 xbar)
## (fixed: the original divided mean(d), not sum(d), by n^2*xbar, which
## underestimates G by a factor of n)
d=(2*seq_len(n)-n-1)*data[,i]
unif_G[i]=sum(d)/(n^2*unif_mean[i])
}
hist(unif_G)
hist(unif_mean)
hist(unif_median)
## ------------------------------------------------------------------------
## Monte Carlo study of the Gini ratio for lognormal samples.
## n: sample size, m: number of replications, mu/sigma: lognormal parameters.
## Prints and (invisibly) returns c(mean, sd, lower 95% CI, upper 95% CI)
## of the Gini estimates. Uses a fixed seed internally for reproducibility.
mc_fun=function(n,m,mu,sigma){
set.seed(528)
data=matrix(rlnorm(m*n,mu,sigma),nrow=n,ncol=m)
logn_G=rep(0,m)
logn_mean=rep(0,m)
for(i in 1:m){
logn_mean[i]=mean(data[,i])
data[,i]=sort(data[,i],decreasing=FALSE) ## order statistics (ascending)
## Gini ratio via order statistics: G = sum_j (2j-n-1) x_(j) / (n^2 xbar)
## (fixed: the original divided mean(d), not sum(d), by n^2*xbar, which
## underestimates G by a factor of n)
d=(2*seq_len(n)-n-1)*data[,i]
logn_G[i]=sum(d)/(n^2*logn_mean[i])
}
## sample mean, sample sd, and approximate 95% CI of the Gini estimates
res=c(mean(logn_G),sd(logn_G),mean(logn_G)-2*sd(logn_G),
mean(logn_G)+2*sd(logn_G))
print(res)
invisible(res)
}
Gini=mc_fun(1000,1000,1,1)
## ------------------------------------------------------------------------
##Simulate from a Multivariate Normal Distribution
library(MASS)
set.seed(4356)
## covariance matrix with unit variances and correlation 0.5
Sigma = matrix(c(1,0.5,0.5,1),2,2)
Sigma
## 1000 bivariate normal draws with mean vector (0, 0)
x=mvrnorm(n=1000, rep(0, 2), Sigma)
## test the association between the two coordinates with three methods
cor.test(x[,1],x[,2],method='pearson') ## Pearson correlation coefficient
cor.test(x[,1],x[,2],method='spearman') ## Spearman rank correlation coefficient
cor.test(x[,1],x[,2],method='kendall') ## Kendall correlation coefficient
## ------------------------------------------------------------------------
## Jackknife estimates of bias and standard error of the sample correlation
## for the law-school (LSAT, GPA) data.
set.seed(1)
library(bootstrap)
data=law # the data set from the bootstrap package
head(data)
n=nrow(data) # sample size
theta.hat=cor(data[,1],data[,2]) # the sample correlation
#### jackknife: recompute the correlation leaving out one observation at a time
theta.jack=vapply(seq_len(n),
function(i) cor(data[-i,1],data[-i,2]),
numeric(1))
jack.bias=(n-1)*mean(theta.jack-theta.hat) # jackknife bias estimate
jack.sd=sqrt((n-1)*mean((theta.jack - mean(theta.jack))^2)) # jackknife sd estimate
round(c(theta.hat=theta.hat,theta.jack=mean(theta.jack),
jack.bias=jack.bias,jack.sd=jack.sd),5)
## ------------------------------------------------------------------------
library(boot)
set.seed(1)
# statistic function for boot(): for resample indices i, return the sample
# mean of aircondit$hours together with a variance estimate of that mean
air.fun <- function(data,i) {
m <- mean(data$hours[i]) # sample mean of the resample
n <- length(i)
v <- (n-1)*var(data$hours[i])/n^2 # variance estimate of the mean
c(m, v)
}
# run the bootstrap with 1000 replicates
air.boot <- boot(aircondit, air.fun, R = 1000)
air.boot #t1 denotes the bootstrapped sample mean, t2 its variance estimate
# four types of bootstrap confidence intervals for the mean
ci <- boot.ci(air.boot,type=c("norm","basic","perc","bca"))
ci
## ------------------------------------------------------------------------
## Jackknife estimates of bias and standard error for the proportion of
## variance explained by the first principal component of the scor data.
library(bootstrap) #loading packages
data=as.matrix(scor) # coerce the data.frame to a matrix
head(data)
n=nrow(data) # sample size
lambda.hat=eigen(t(data)%*%data/n)$values ## sample eigenvalues
theta.hat=lambda.hat[1]/sum(lambda.hat) ## proportion of the first eigenvalue
#### jackknife: leave one row out and recompute the proportion
prop1=function(mat,size){ # largest-eigenvalue proportion of t(mat)%*%mat/size
ev=eigen(t(mat)%*%mat/size)$values
ev[1]/sum(ev)
}
theta.jack=vapply(seq_len(n),
function(i) prop1(data[-i,],n-1),
numeric(1))
jack.bias=(n-1)*mean(theta.jack-theta.hat) #jackknife bias
jack.sd=sqrt((n-1)*mean((theta.jack - mean(theta.jack))^2)) #jackknife se
round(c(theta.jack=mean(theta.jack),theta.hat=theta.hat,
jack.bias=jack.bias,jack.sd=jack.sd),7)
## ------------------------------------------------------------------------
## Leave-two-out cross-validation of four regression models on ironslag.
library(DAAG) # loading packages
data=ironslag # data set
n=nrow(data) # sample size
## all C(n,2) held-out index pairs. combn() replaces the hand-rolled double
## loop and produces the pairs in the same (1,2),(1,3),... order.
index=t(combn(n,2))
## squared-error storage: one row per pair, one column per held-out point
e1= e2 = e3 = e4 = matrix(nrow=nrow(index),ncol=2)
# fit the candidate models on each leave-two-out training sample
for(l in 1:nrow(index)){
k=index[l,] # the indices of the two held-out observations
y=data[-k,2] # response of the training data
x=data[-k,1] # predictor of the training data
## linear model
J1 = lm(y ~ x)
yhat1 = J1$coef[1] + J1$coef[2] * data[k,1] # predictions on the held-out pair
e1[l,] = data[k,2] - yhat1
## quadratic model
J2 <- lm(y ~ x + I(x^2))
yhat2 <- J2$coef[1] + J2$coef[2] * data[k,1] + # predictions on the held-out pair
J2$coef[3] * data[k,1]^2
e2[l,] = data[k,2] - yhat2
## exponential model: log(y) linear in x, back-transformed
J3 = lm(log(y) ~ x)
logyhat3 =J3$coef[1]+J3$coef[2] * data[k,1]
yhat3 <- exp(logyhat3)
e3[l,] = data[k,2] - yhat3
## log-log model: log(y) linear in log(x), back-transformed
J4 <- lm(log(y) ~ log(x))
logyhat4 <- J4$coef[1] + J4$coef[2] * log(data[k,1])
yhat4 <- exp(logyhat4)
e4[l,] = data[k,2] - yhat4
}
## mean squared prediction error of each model
print(c(mse1=mean(e1^2), mse2=mean(e2^2), mse3=mean(e3^2), mse4=mean(e4^2)))
## ------------------------------------------------------------------------
## Two-sample Cramer-von Mises statistic W2: compares the empirical cdfs
## Fn (of x) and Gm (of y) evaluated over all pooled sample points.
cvm.d<-function(x,y){
n<-length(x)
m<-length(y)
Fn<-ecdf(x)
Gm<-ecdf(y)
# squared cdf differences at the x points and at the y points
gap2<-c((Fn(x)-Gm(x))^2,(Fn(y)-Gm(y))^2)
((m*n)/(m+n)^2)*sum(gap2)
}
## ------------------------------------------------------------------------
## Permutation test of equal distributions for two chickwts diets using
## the Cramer-von Mises statistic cvm.d defined above.
library(boot)
## (fixed: subset chickwts directly instead of attach()/detach(), which
## modifies the global search path)
x <- sort(chickwts$weight[chickwts$feed == "soybean"])
y <- sort(chickwts$weight[chickwts$feed == "linseed"])
set.seed(611)
R <- 999 #number of permutation replicates
z <- c(x, y) #pooled sample
n<-length(x)
m<-length(y)
N <- 1:(m+n)
reps <- numeric(R) #storage for replicates
W2 <- cvm.d(x, y) # observed statistic
W2
for (i in 1:R) {
# indices of a random re-split of the pooled sample
k <- sample(N, size = n, replace = FALSE)
x1 <- z[k]
y1 <- z[-k] #complement of x1
reps[i] <- cvm.d(x1, y1)
}
# permutation p-value (observed statistic included in the reference set)
p <- mean(c(W2, reps) >= W2)
p
## ---- echo=TRUE----------------------------------------------------------
## Random-walk Metropolis sampler for the standard Cauchy distribution.
set.seed(1)
# Cauchy density with scale theta and location eta
f <- function(x, theta,eta) {
stopifnot(theta > 0)
return(1/(theta*pi*(1+((x-eta)/theta)^2)))
}
# (fixed: removed two stray statements `xt <- x[i-1]` / `y <- xt+runif(...)`
# that referenced the chain and loop index before they were defined and
# consumed an extra RNG draw)
m <- 10000 # chain length
theta <- 1
eta <- 0
x <- numeric(m) # the Metropolis chain
x[1] <- runif(1,-1,1) # initial state
k <- 0 # rejection counter
u <- runif(m) # uniforms for the accept/reject decisions
for (i in 2:m) {
xt <- x[i-1]
y <- xt+runif(1,min=-1,max=1) # symmetric random-walk proposal
# acceptance ratio; the two dnorm factors are equal by symmetry and
# cancel, kept for fidelity to the textbook formulation
num <- f(y, theta,eta)*dnorm(xt,mean=y,sd=1)
den <- f(xt, theta,eta)*dnorm(y,mean=xt,sd=1)
if (u[i] <= num/den) x[i] <- y else {
x[i] <- xt
k <- k+1 #y is rejected
}
}
print(k) # number of rejected proposals
## ---- echo=TRUE----------------------------------------------------------
## trace plot of a section of the chain (iterations 1500-2000)
index <- 1500:2000
y1 <- x[index]
plot(index, y1, type="l", main="", ylab="x")
## ---- echo=TRUE----------------------------------------------------------
## deciles of the generated chain (first 1000 draws discarded as burn-in)
## vs. the theoretical deciles of the standard Cauchy (= t with 1 df)
gene<- quantile(x[-(1:1000)],seq(0.1,0.9,0.1))
empi<-qt(seq(0.1,0.9,0.1),1)
rbind(gene,empi)
## ---- echo=TRUE----------------------------------------------------------
b <- 1001 #discard the burnin sample
y <- x[b:m]
probs <- seq(0.1,0.9,0.01)
QR <- qt(probs,df=1) # theoretical Cauchy quantiles
# NOTE(review): Q is computed from the full chain x (burn-in included),
# while the histogram below uses the trimmed chain y -- confirm intended
Q <- quantile(x, probs)
qqplot(QR, Q, main="",
xlab="Cauchy Quantiles", ylab="Sample Quantiles")
hist(y, breaks="scott", main="", xlab="", freq=FALSE)
# overlay the target Cauchy density
lines(QR, f(QR, 1,0))
## ---- echo=TRUE----------------------------------------------------------
## Setup for the random-walk Metropolis sampler of the next chunk.
set.seed(1)
w <- .25 #width of the uniform support set
m <- 5000 #length of the chain
burn <- 1000 #burn-in time
total <- 197 # 125+20+18+34, the sum of the group counts used below
x <- numeric(m) #the chain
## ---- echo=TRUE----------------------------------------------------------
group <- c(125,20,18,34) # observed multinomial counts
# target density of the parameter y, up to a constant: the multinomial
# likelihood with cell probabilities (2+y)/4, (1-y)/4, (1-y)/4, y/4
prob <- function(y, group) {
# computes (without the constant) the target density
if (y < 0 || y >= 1)
return (0) # zero outside the support
return(((2+y)/4)^group[1] *
((1-y)/4)^group[2] * ((1-y)/4)^group[3] *
((y/4)^group[4]))
}
# random-walk Metropolis with U(-w, w) increments
u <- runif(m) #for accept/reject step
v <- runif(m, -w, w) #proposal distribution
x[1] <- 0.5 # initial value
for (i in 2:m) {
y <- x[i-1] + v[i] # proposed state
if (u[i] <= prob(y, group) / prob(x[i-1], group))
x[i] <- y else
x[i] <- x[i-1] # reject: keep the current state
}
print(group)
print(round(group/total, 3)) # observed cell proportions
## ---- echo=TRUE----------------------------------------------------------
set.seed(1)
## posterior mean estimate from the chain after discarding the burn-in
xb <- x[(burn+1):m]
print(mean(xb))
## ------------------------------------------------------------------------
## log target density (up to a constant) of the linkage parameter theta:
## counts 125, 38 (= 20+18) and 34 with log-probability terms log(2+theta),
## log(1-theta) and log(theta)
lden=function(theta)125*log(2+theta)+38*log(1-theta)+34*log(theta)
## Independence Metropolis sampler: proposals are fresh U(0,1) draws,
## accepted with probability exp(ld(p) - ld(x)); x is the starting value,
## burn_in draws are dropped, and the chain after burn-in is returned.
MCMC=function(x,burn_in=0,N=burn_in+5000,print_acc=F){
y=1:N#for MCMC generation
ld=lden#use log-likelihood, things could be a little bit different.
acc=0
for(i in 1:N){
p=runif(1)# independent proposal
y[i]=x=if(runif(1)<exp(ld(p)-ld(x))){acc=acc+1;p}else x
}
if(print_acc)print(acc/N)# empirical acceptance rate
y[(burn_in+1):N]
}
plot(MCMC(0.5,print_acc=T))#accept rate~0.15
## ------------------------------------------------------------------------
## Gelman-Rubin convergence diagnostic.
## psi[i, j] is the statistic psi(X[i, 1:j]) computed from the first j
## draws of chain i. Returns the estimated potential scale reduction
## factor R-hat.
Gelman.Rubin <- function(psi) {
psi <- as.matrix(psi)
n <- ncol(psi) # chain length
k <- nrow(psi) # number of chains
chain.means <- rowMeans(psi)
B <- n * var(chain.means) # between-chain variance estimate
W <- mean(apply(psi, 1, var)) # within-chain variance estimate
v.hat <- W*(n-1)/n + B/n + B/(n*k) # pooled (upper) variance estimate
v.hat / W # the G-R statistic R-hat
}
## ------------------------------------------------------------------------
## Four chains from different starting points; M1..M4 hold the running means.
M1=cumsum((MC1=MCMC(0.2)))/1:5000#mean_cumsum
M2=cumsum((MC2=MCMC(0.4)))/1:5000#mean_cumsum
M3=cumsum((MC3=MCMC(0.6)))/1:5000#mean_cumsum
M4=cumsum((MC4=MCMC(0.8)))/1:5000#mean_cumsum
psi=rbind(M1,M2,M3,M4)
## R-hat computed on the first i columns, i = 1..100
plot((R=sapply(1:100,function(i)Gelman.Rubin(psi[,1:i]))),main="R value of Gelman-Rubin method",type='l',ylab = "")
## res = one past the last index (within 2..100) at which R-hat still
## exceeds 1.2, i.e. the point from which the chains are treated as converged
res=(1:100)[R[2:100]>1.2][sum(R[2:100]>1.2)]+1
## pooled posterior mean and variance of the four chains from res onward
c(mean(c(MC1[res:5000],MC2[res:5000],MC3[res:5000],MC4[res:5000])),
var(c(MC1[res:5000],MC2[res:5000],MC3[res:5000],MC4[res:5000])))
## ------------------------------------------------------------------------
n=25
A=rep(0,length(c(4:n)))
for(k in 4:n){
f=function(x){
pt(sqrt(x^2*k/(k+1-x^2)),k)-pt(sqrt(x^2*(k-1)/(k-x^2)),k-1)}
A[k-3]=uniroot(f,c(0.00001,sqrt(k)-0.00001))$root #solve the intersection points
}
plot(A,type='l')
## ------------------------------------------------------------------------
n=100
A=rep(0,length(c(4:n)))
for(k in 4:n){
f=function(x){
pt(sqrt(x^2*k/(k+1-x^2)),k)-pt(sqrt(x^2*(k-1)/(k-x^2)),k-1)}
A[k-3]=uniroot(f,c(0.00001,sqrt(k)-0.00001))$root #solve the intersection points
}
plot(A,type='l')
## ------------------------------------------------------------------------
n=500
A=rep(0,length(c(4:n)))
for(k in 4:n){
f=function(x){
pt(sqrt(x^2*k/(k+1-x^2)),k)-pt(sqrt(x^2*(k-1)/(k-x^2)),k-1)}
A[k-3]=uniroot(f,c(0.00001,sqrt(k)-0.00001))$root #solve the intersection points
}
plot(A,type='l')
## ------------------------------------------------------------------------
n=1000
A=rep(0,length(c(4:n)))
for(k in 4:n){
f=function(x){
pt(sqrt(x^2*k/(k+1-x^2)),k)-pt(sqrt(x^2*(k-1)/(k-x^2)),k-1)}
A[k-3]=uniroot(f,c(0.00001,sqrt(k)-0.00001))$root #solve the intersection points
}
plot(A,type='l')
## ------------------------------------------------------------------------
## pdf of the Cauchy distribution with scale theta and location eta
pdf_cauchy=function(t,theta,eta){
1/(theta*pi*(1+((t-eta)/theta)^2))
}
x=seq(0,10,0.1)
## numeric cdf: integrate the pdf from -Inf up to each grid point, then
## compare against pcauchy
## case 1: theta=1, eta=0
num_int=vapply(x,function(upper)
integrate(pdf_cauchy,-Inf,upper,theta=1,eta=0)$value,numeric(1))
p=pcauchy(x,location = 0, scale = 1) # reference cdf via pcauchy
plot(x,num_int) # the numeric integral values
lines(x,p,col='green') # the pcauchy curve
## case 2: theta=1, eta=1
num_int=vapply(x,function(upper)
integrate(pdf_cauchy,-Inf,upper,theta=1,eta=1)$value,numeric(1))
p=pcauchy(x,location = 1, scale = 1) # reference cdf via pcauchy
plot(x,num_int) # the numeric integral values
lines(x,p,col='green') # the pcauchy curve
## ------------------------------------------------------------------------
## Fit four regression models to mtcars, once with an explicit loop and
## once with lapply().
formulas = list(
mpg ~ disp,
mpg ~ I(1 / disp), mpg~disp+wt, mpg~I(1/disp)+wt
)
### loop version (preallocated result list)
out1=vector("list",length(formulas))
for(i in seq_along(formulas)){
out1[[i]]=lm(formulas[[i]],data=mtcars)
}
out1
### lapply() version
out2=lapply(formulas,lm,data=mtcars)
out2
## ------------------------------------------------------------------------
## Ten bootstrap resamples of mtcars, each fit with mpg ~ disp.
## (fixed: spelled out `replace = TRUE`; the original relied on partial
## matching of the abbreviated `rep`)
bootstraps <- lapply(1:10, function(i) {
rows <- sample(seq_len(nrow(mtcars)), replace = TRUE)
mtcars[rows, ]
})
### loop version
out3=list()
for(i in 1:10){
out3[[i]]=lm(mpg ~ disp,data=bootstraps[[i]])
}
out3
### lapply version: keep only the (mpg, disp) columns so that lm(data)
### builds the formula mpg ~ disp from the data frame itself
newdata=list()
for(i in 1:10){
newdata[[i]]=bootstraps[[i]][,c(1,3)]
}
out4=lapply(newdata,lm)
out4
## ------------------------------------------------------------------------
## extract the coefficient of determination R^2 from a fitted lm object
rsq <- function(mod) summary(mod)$r.squared
## compute R2 for out1 (models fit by the loop over formulas)
R2_1=lapply(out1,rsq)
R2_1
## compute R2 for out2 (models fit via lapply over formulas)
R2_2=lapply(out2,rsq)
R2_2
## compute R2 for out3 (bootstrap fits, loop version)
R2_3=lapply(out3,rsq)
R2_3
## compute R2 for out4 (bootstrap fits, lapply version)
R2_4=lapply(out4,rsq)
R2_4
## ------------------------------------------------------------------------
trials <- replicate(
100,
t.test(rpois(10, 10), rpois(7, 10)),
simplify = FALSE
)
## define a function to extract p.value
p_fun=function(f)(f$p.value)
sapply(trials,p_fun)
## ------------------------------------------------------------------------
## chisq.test1: a verbatim copy of stats::chisq.test, kept in this script so
## its runtime can be benchmarked against the stripped-down chisq.test2 below.
## NOTE(review): the simulate.p.value branch calls .Call(C_chisq_sim, ...),
## an internal symbol of the stats namespace that is not visible from a plain
## script -- that branch would fail here; only the default path is exercised.
chisq.test1=function (x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)),
rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
{
DNAME <- deparse(substitute(x))
if (is.data.frame(x))
x <- as.matrix(x)
if (is.matrix(x)) {
if (min(dim(x)) == 1L)
x <- as.vector(x)
}
## two-vector case: cross-tabulate factor(x) by factor(y)
if (!is.matrix(x) && !is.null(y)) {
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
DNAME2 <- deparse(substitute(y))
xname <- if (length(DNAME) > 1L || nchar(DNAME, "w") >
30)
""
else DNAME
yname <- if (length(DNAME2) > 1L || nchar(DNAME2, "w") >
30)
""
else DNAME2
OK <- complete.cases(x, y)
x <- factor(x[OK])
y <- factor(y[OK])
if ((nlevels(x) < 2L) || (nlevels(y) < 2L))
stop("'x' and 'y' must have at least 2 levels")
x <- table(x, y)
names(dimnames(x)) <- c(xname, yname)
DNAME <- paste(paste(DNAME, collapse = "\n"), "and",
paste(DNAME2, collapse = "\n"))
}
if (any(x < 0) || anyNA(x))
stop("all entries of 'x' must be nonnegative and finite")
if ((n <- sum(x)) == 0)
stop("at least one entry of 'x' must be positive")
if (simulate.p.value) {
setMETH <- function() METHOD <<- paste(METHOD, "with simulated p-value\n\t (based on",
B, "replicates)")
almost.1 <- 1 - 64 * .Machine$double.eps
}
## contingency-table case: test of independence
if (is.matrix(x)) {
METHOD <- "Pearson's Chi-squared test"
nr <- as.integer(nrow(x))
nc <- as.integer(ncol(x))
if (is.na(nr) || is.na(nc) || is.na(nr * nc))
stop("invalid nrow(x) or ncol(x)", domain = NA)
sr <- rowSums(x)
sc <- colSums(x)
E <- outer(sr, sc, "*")/n
v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3
V <- outer(sr, sc, v, n)
dimnames(E) <- dimnames(x)
if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
setMETH()
tmp <- .Call(C_chisq_sim, sr, sc, B, E)
STATISTIC <- sum(sort((x - E)^2/E, decreasing = TRUE))
PARAMETER <- NA
PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC))/(B +
1)
}
else {
if (simulate.p.value)
warning("cannot compute simulated p-value with zero marginals")
if (correct && nrow(x) == 2L && ncol(x) == 2L) {
YATES <- min(0.5, abs(x - E))
if (YATES > 0)
METHOD <- paste(METHOD, "with Yates' continuity correction")
}
else YATES <- 0
STATISTIC <- sum((abs(x - E) - YATES)^2/E)
PARAMETER <- (nr - 1L) * (nc - 1L)
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
## vector case: goodness-of-fit test against the probabilities p
else {
if (length(dim(x)) > 2L)
stop("invalid 'x'")
if (length(x) == 1L)
stop("'x' must at least have 2 elements")
if (length(x) != length(p))
stop("'x' and 'p' must have the same number of elements")
if (any(p < 0))
stop("probabilities must be non-negative.")
if (abs(sum(p) - 1) > sqrt(.Machine$double.eps)) {
if (rescale.p)
p <- p/sum(p)
else stop("probabilities must sum to 1.")
}
METHOD <- "Chi-squared test for given probabilities"
E <- n * p
V <- n * p * (1 - p)
STATISTIC <- sum((x - E)^2/E)
names(E) <- names(x)
if (simulate.p.value) {
setMETH()
nx <- length(x)
sm <- matrix(sample.int(nx, B * n, TRUE, prob = p),
nrow = n)
ss <- apply(sm, 2L, function(x, E, k) {
sum((table(factor(x, levels = 1L:k)) - E)^2/E)
}, E = E, k = nx)
PARAMETER <- NA
PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B +
1)
}
else {
PARAMETER <- length(x) - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
## assemble the htest result object
names(STATISTIC) <- "X-squared"
names(PARAMETER) <- "df"
if (any(E < 5) && is.finite(PARAMETER))
warning("Chi-squared approximation may be incorrect")
structure(list(statistic = STATISTIC, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, observed = x,
expected = E, residuals = (x - E)/sqrt(E), stdres = (x -
E)/sqrt(V)), class = "htest")
}
## ------------------------------------------------------------------------
## Stripped-down Pearson chi-squared statistic for two equal-length vectors:
## cross-tabulates factor(x) by factor(y) and returns only the X-squared
## value (no p-value, no Yates correction) as a fast alternative to
## chisq.test() when just the statistic is needed.
chisq.test2=function (x, y)
{
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
OK <- complete.cases(x, y) # drop pairs with missing values
x <- factor(x[OK])
y <- factor(y[OK])
if ((nlevels(x) < 2L) || (nlevels(y) < 2L))
stop("'x' and 'y' must have at least 2 levels")
x <- table(x, y) # the contingency table
n <- sum(x) # total count
sr <- rowSums(x)
sc <- colSums(x)
E <- outer(sr, sc, "*")/n # expected counts under independence
dimnames(E) <- dimnames(x)
## Pearson statistic. (fixed: removed the unused uniform-p vector and the
## unused variance matrix V, and dropped the pointless sort() before the sum)
sum((x - E)^2/E)
}
# suppress the "Chi-squared approximation may be incorrect" warnings from
# the small-count example below
# NOTE(review): options(warn=-1) silences warnings globally for the rest of
# the session; consider suppressWarnings() around the individual calls instead
options(warn=-1)
x <- c(89,37,30,28,2,14,25)
y <- c(29,47,10,8,14,32,52)
chisq.test1(x,y)$statistic # statistic from the full reimplementation
chisq.test2(x,y) # statistic from the stripped-down version
library(microbenchmark)
# compare the speed of the two implementations
microbenchmark(
chisq.test1(x,y)$statistic,
chisq.test2(x,y)
)
## (The two lines below are leftover website-embed text from the page this
## script was scraped from; commented out because they are not valid R code.)
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.