## ------------------------------------------------------------------------
MCestimate=function(n,x){
u=runif(n) #Generate iid Uniform(0,1) random numbers
cdf=numeric(length(x))
for(i in 1:length(x)){
g=30*x[i]^3*u^2*(1-u*x[i])^2
cdf[i]=mean(g) #the estimated value of the cdf
}
phi=pbeta(x,3,3) #the estimated values returned by the pbeta function
print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
MCestimate(1000,c(1,2,3,4,5,6,7,8,9)/10)
## ------------------------------------------------------------------------
chisq_test=function(x, y) {
n=sum(x) + sum(y)
ni._x=sum(x)
ni._y=sum(y)
chisq.statistic=0
for (i in seq_along(x)) {
n.j=x[i] + y[i]
expected_x=(ni._x*n.j)/n
expected_y=(ni._y*n.j)/n
chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
}
chisq.statistic
}
chisq_test(c(3,8,4,9),c(6,9,3,5))
## ------------------------------------------------------------------------
Jacknife.estimate=function(x,y){
set.seed(1)
n=length(x)
theta.hat=cor(x,y) #the correlation estimate
print(theta.hat)
#compute the jackknife replicates, leave-one-out estimates
theta.jack=numeric(n)
for(i in 1:n){
theta.jack[i]=cor(x[-i],y[-i])
}
bias=(n-1)*(mean(theta.jack)-theta.hat)
print(bias) #a jackknife estimate of the bias of the correlation statistic
se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
print(se) #a jackknife estimate of the standard error of the correlation statistic
}
Jacknife.estimate(c(0.1,0.5,1,1.2,0.2,0.4,0.8,0.6,0.8,1.3),c(4,2.7,5.4,3.6,6.3,3.9,4.7,3.9,2.8,5.6))
## ------------------------------------------------------------------------
ID=c(1,2,3,4)
age=c(25,34,28,52)
status=c("poor","improved","excelled","poor")
patientdata<-data.frame(ID,age,status)
patientdata
## ------------------------------------------------------------------------
m1=matrix(1,nr=2,nc=2)
m2=matrix(2,nr=2,nc=2)
m=rbind(m1,m2) #merge matrices
m
n=cbind(m1,m2)
n
## ------------------------------------------------------------------------
x=rnorm(10)
y=rnorm(10)
plot(x,y,xlab="Ten random values",ylab="Ten other values",xlim=c(-2,2),ylim=c(-2,2),main="How to customize a plot with R")
## ------------------------------------------------------------------------
#Solution 1
x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
cp=cumsum(p)
n=1000
r=numeric(n)
r=x[findInterval(runif(n),cp)+1] # the 1000 random numbers generated from the distribution of X
ct=as.vector(table(r)) # take the numbers of every value of X in R and form a vector
pr=ct/sum(ct) #the empirical probabilities
s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function
st=as.vector(table(s)) # take the numbers of every value of X in R and form a vector
psample=st/sum(st) #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample)
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function
## ------------------------------------------------------------------------
#Solution 2
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
x=rep(1,1000) # replicates the values of 1
for(i in 1:1000){
u=runif(1)
if(u<0.1) x[i]=0
else{
if(u<0.3) x[i]=1
else{
if(u<0.5) x[i]=2
else{
if(u<0.7) x[i]=3
else x[i]=4
}
}
}
}
# x :the 1000 random numbers generated from the distribution of X
ct=as.vector(table(x)) # take the numbers of every value of X in R and form a vector
pr=ct/sum(ct) #the empirical probabilities
s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function
st=as.vector(table(s)) # take the numbers of every value of X in R and form a vector
psample=st/sum(st) #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample)
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function
## ------------------------------------------------------------------------
# beta(3,2) with pdf f(x)=12x^2(1-x),0<x<1
# g(x)=x^2,0<x<1 and c=12
n=1000
k=0 #counter for accepted
j=0 #iterations
y=numeric(n)
while(k<n){
u=runif(1)
j=j+1
x=runif(1) #andom variate from g
if(x^2*(1-x)>u){
#we accept x
k=k+1
y[k]=x #deliver y=x
}
}
j
# y:the 1000 random numbers generated from the Beta(a,b) distribution
plot(y) #the scatter plot of the 1000 random numbers generated from the Beta(a,b) distribution
hist(y,prob=TRUE,main=expression(f(x)==24*x*(1-x)-12*x^2)) #density histogram of sample
t=seq(0,1,.01)
lines(t,12*t^2*(1-t)) #density curve f(x)
## ------------------------------------------------------------------------
#generate a Poisson-Gamma mixture
n=1000
r=4
beta=2
lambda=rgamma(n, r, beta) #lambda is random
#now supply the sample of lambda???s as the exponential mean
y=rexp(n, lambda) #the 1000 random numbers generated from the mixture distribution
plot(y) # the scatter plot of the 1000 random numbers generated from the mixture distribution
## ------------------------------------------------------------------------
#write a function to compute aMonte Carlo estimate of the Beta(3,3)
my_MCestimate=function(n,x){
u=runif(n) #Generate iid Uniform(0,1) random numbers
cdf=numeric(length(x))
for(i in 1:length(x)){
g=30*x[i]^3*u^2*(1-u*x[i])^2
cdf[i]=mean(g) #the estimated value of the cdf
}
phi=pbeta(x,3,3) #the estimated values returned by the pbeta function
print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
x=c(1,2,3,4,5,6,7,8,9)/10
n=1000
my_MCestimate(n,x) #the estimated values returned by the my_MCestimate function
## ------------------------------------------------------------------------
#Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution,using antithetic variables.
MC.av=function(x, m= 10000, antithetic = TRUE) {
u=runif(m/2) #Generate random numbers u(1),u(2),...,u(m/2)~Unifrom(0,1)
if (!antithetic) v=runif(m/2) else
v=1-u
u=c(u, v)
sigma=10
cdf=numeric(length(x))
for (i in 1:length(x)) {
g=u*x[i]^2/(sigma^2)* exp(-(u * x[i])^2/2*(sigma)^2)
cdf[i]=mean(g) #the estimated value of the cdf
}
cdf
}
x=seq(1.0,5.0,length=10)
set.seed(123)
MC1=MC.av(x,anti=FALSE) ##generate samples from the distribution without using antithetic variables
set.seed(123)
MC2=MC.av(x) #generate samples from the distribution by using antithetic variables
print(round(rbind(x,MC1,MC2),10))
c(sd(MC1),sd(MC2))
print((var(MC1)-var(MC2))/var(MC1)) #the percent reduction in variance of $\frac{X+X'}{2}$ compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$
## ------------------------------------------------------------------------
m=10000
theta.hat=se=numeric(2)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rnorm(m) #using f1
fg1=g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
theta.hat[1]=mean(fg1) #The first estimate
se[1]=sd(fg1) #The variance of the first estimate
x=rgamma(m,3,1) #using f2
x=x[x>1] #Limit the range of x,let x>1
fg2=g(x)/((1/2)*(x^2)*exp(-x))
theta.hat[2]=mean(fg2) #The second estimate
se[2]=sd(fg2) #The variance of the second estimate
rbind(theta.hat, se)
#Draw the curves of g, f1 and f2, then compare them
curve(g,1,5,ylab="",main="The comparsion of the curves of g,f1 and f2")
f1=function(x){
(1/sqrt(2*pi))*exp(-(x^2)/2)
}
curve(f1,1,5,add=TRUE,col="red")
f2=function(x){
(1/2)*(x^2)*exp(-x)
}
curve(f2,1,5,add=TRUE,col="green")
#Draw the curves of fg1 and fg2, and compare them
fg1=function(x){
g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
}
curve(fg1,1,5,ylab="",main="The comparsion of the curves of fg1 and fg2")
fg2=function(x){
g(x)/((1/2)*(x^2)*exp(-x))
}
curve(fg2,1,5,add=TRUE,col="red")
## ------------------------------------------------------------------------
m=10000
theta.hat=se=numeric(1)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rgamma(m,3,1) #using f
x=x[x>1] #Limit the range of x,let x>1
fg=g(x)/((1/2)*(x^2)*exp(-x))
theta.hat[1]=mean(fg) #The second estimate
se[1]=sd(fg) #The variance of the second estimate
rbind(theta.hat, se)
## ------------------------------------------------------------------------
#the number of test n=1000
#estimate the G
n=1000
G.hat=matrix(0,nrow=3,ncol=n)
mean.G.hat=numeric(3)
median.G.hat=numeric(3)
deciles.G.hat=numeric(3)
x=matrix(0,nrow=3,ncol=n)
mu.hat=matrix(0,nrow=3,ncol=n)
t=numeric(n)
for(k in 1:3){
for(i in 1:n){
# i reprent the i-th test
x[1,1:n]=sort(rlnorm(n)) #generated the random number of x1,...,xn from the standard lognormal distribution,and sort them
x[2,1:n]=sort(runif(n)) #generated the random number of x1,...,xn from the uniform distribution,and sort them
x[3,1:n]=sort(rbinom(n,1,0.1)) #generated the random number of x1,...,xn from the Bernoulli(0.1),and sort them
mu.hat[k,i]=sum(x[k,1:n])/n #mu replaced by the mean of x
for(j in 1:n){
t[j]=(2*j-n-1)*x[k,j]
}
G.hat[k,i]=sum(t[1:n])/(mu.hat[k,i]*(n^2)) #the estimate of G from the k-th distribution
}
}
#Estimate the mean,median and deciles of G.hat if X is standard lognorm,uniform distribution and benoulli(0.1)
mean.G.hat=apply(G.hat,1,mean) #the mean of G.hat
median.G.hat=apply(G.hat,1,median) #the median of G.hat
deciles.G.hat[1]=quantile(G.hat[1,1:n],0.1) #the deciles of G.hat
deciles.G.hat[2]=quantile(G.hat[2,1:n],0.1)
deciles.G.hat[3]=quantile(G.hat[3,1:n],0.1)
round(rbind(mean.G.hat,median.G.hat,deciles.G.hat),3) #output the mean,median and deciles of G.hat when x is standard lognormal,the uniform distribution and Bernoulli(0.1)
#construct the density histograms of the replicates
hist(G.hat[1,1:n],main="density histogram of G.hat when x is standard lognormal")
hist(G.hat[2,1:n],main="density histogram of G.hat when x is uniform distribution")
hist(G.hat[3,1:n],main="density histogram of G.hat when x is Bernoulli(0.1)")
## ------------------------------------------------------------------------
#construct a function to return the 95% confidence interval of gamma
CI=function(n,a,b,alpha=0.05){
t=numeric(n)
g=numeric(n)
for(i in 1:n){
x=sort(rlnorm(n,a,b))
mu.hat=mean(x)
for(j in 1:n){
t[j]=(2*j-n-1)*x[j]
}
g[i]=mean(t)/(n*mu.hat)
}
gamma.hat=mean(g)
sd.hat=sqrt(sum((g-mean(g))^2))/n #estimate the standard deviation of gamma.hat
c=qnorm(1-alpha/2,0,1)*sd.hat
CI1=gamma.hat-c #the confidence lower limit of gamma
CI2=gamma.hat+c #the confidence upper limit of gamma
c(CI1,CI2)
}
CI(1000,1,1,0.05) #construct an approximate 95% confidence interval of gamma when the parameters are given
#assess the coverage rate
d=matrix(0,nrow=100,ncol=2)
for(k in 1:100){
d[k,1:2]=CI(1000,1,1,0.05)
}
CI1=d[1:100,1]
CI2=d[1:100,2]
#simulate the true value of gamma
gamma=2*pnorm(1/sqrt(2),0,1)-1 #I get this value in an established paper
gamma
m=0
for(i in 1:100){
if(gamma<CI1[i]) m=m
else{
if(gamma>CI2[i]) m=m
else{
m=m+1
}
}
}
m
CP=m/100
CP #the coverage rate
## ------------------------------------------------------------------------
n=1000
mu=0
sigma=0.9
repsig=400
repmean=100
system.time({
G=function(x,mu=mean(x),n=length(x))sort(x)%*%(1:n*2-n-1)/n/n/mu
mG=function(sig){mu=mean(rlnorm(repmean*n,0,sig));mean(sapply(1:repmean,function(...){G(rlnorm(n,0,sig),mu)}))}
X=rlnorm(n,mu,sigma)
print((CI_sig=sqrt(var(log(X))/qchisq(c(0.9875,0.0125),df=n-1)*(n-1))))
rsig=sqrt(var(log(X))/qchisq(runif(repsig,0.0125,0.9875),df=n-1)*(n-1))
Gs=sapply(rsig,mG)
print(quantile(Gs,c(0.0125,0.9875)))
})
## ------------------------------------------------------------------------
library(MASS)
Sigma=matrix(c(1,2,1,9),2,2)
A=mvrnorm(n=1000, rep(0, 2), Sigma) #the sampled distribution is bivariate normal
plot(A[1:1000,1],A[1:1000,2])
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="pearson",conf.level=0.95)
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="kendall",conf.level=0.95)
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="spearman",conf.level=0.95)
#compute the power
power=matrix(0,nrow=20,ncol=3)
for(i in 1:20) {
pvalues <- replicate(100, expr = {
Sigma=matrix(c(1,2,1,9),2,2)
A=mvrnorm(n=100, rep(0, 2), Sigma)
ctest=cor.test(A[1:100,1],A[1:100,2],alternative="two.sided",method=c("pearson","kendall","spearman"),conf.level=0.95)
ctest$p.value
} )
power[i,1:3]=mean(pvalues<=0.05)
}
## ------------------------------------------------------------------------
set.seed(1)
library(bootstrap) #for the law data
LSAT=law$LSAT
GPA=law$GPA
n=nrow(law)
system.time({
theta.hat=cor(LSAT,GPA) #the correlation estimate
print(theta.hat)
#compute the jackknife replicates, leave-one-out estimates
theta.jack=numeric(n)
for(i in 1:n){
theta.jack[i]=cor(LSAT[-i],GPA[-i])
}
bias=(n-1)*(mean(theta.jack)-theta.hat)
print(bias) #a jackknife estimate of the bias of the correlation statistic
se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
print(se) #a jackknife estimate of the standard error of the correlation statistic
})
## ------------------------------------------------------------------------
set.seed(1)
library(boot) #for boot and boot.ci
n=100 #the times of bootstrap replicates
system.time({
x=c(3,5,7,18,43,85,91,98,100,130,230,487)
boot.mean=function(x,i) mean(x[i])
mean.hat=mean(x)
boot.obj=boot(x,statistic = boot.mean,R=n)
print(boot.ci(boot.obj,type=c("norm","basic","perc","bca")))
})
## ------------------------------------------------------------------------
set.seed(1)
data(scor, package = "bootstrap") #for the score data
library(MASS)
scor.m= matrix(as.numeric(t(scor)), 5, 88)
n=nrow(scor) #sample size
mu=rowMeans(scor.m)
mu0=matrix(mu,nrow=5,ncol=n)
system.time({
sigma.hat=((scor.m-mu0)%*%t(scor.m-mu0))/(n-1) # the MLE of covariance matrix
ev.hat=sort(eigen(sigma.hat)$values,decreasing=T)
theta.hat=ev.hat[1]/sum(ev.hat)
theta.hat #the estimate of theta
#obtain the jackknife estimates of bias and standard error of theta.hat
theta.jack=numeric(n)
for(i in 1:n){
scor.m.jack= scor.m[,-i] #compute the jackknife replicates, leave-one-out estimates
mu1=matrix(mu,nrow=5,ncol=n-1)
sigma.jack=((scor.m.jack-mu1)%*%t(scor.m.jack-mu1))/(n-1)
ev.jack=sort(eigen(sigma.jack)$values,decreasing=T)
theta.jack[i]=ev.jack[1]/sum(ev.jack)
}
theta.jack
bias=(n-1)*(mean(theta.jack)-theta.hat)
print(bias) #a jackknife estimate of the bias
se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
print(se) #a jackknife estimate of the standard error
})
## ------------------------------------------------------------------------
set.seed(1)
library(DAAG)
attach(ironslag)
n=length(magnetic) #in DAAG ironslag
m=n*(n-1)/2 # leave-two-out cross validation require training and validating the model m times
spe1=spe2=spe3=spe4=numeric(m)
# for leave-two-out cross validation
# fit models on leave-two-out samples
k=0
for(i in 1:(n-1)){
for(j in (i+1):n){
k=k+1
y=magnetic[-c(i,j)] #leave-two-out
x=chemical[-c(i,j)]
J1=lm(y ~ x)
yhat11=J1$coef[1] + J1$coef[2] * chemical[i] #the estimate value of the i-th value
yhat12=J1$coef[1] + J1$coef[2] * chemical[j] #the estimate value of the j-th value
spe1[k]=((magnetic[i]-yhat11)^2+(magnetic[j]-yhat12)^2)/2 #the squared prediction error of the k-th
J2=lm(y ~ x + I(x^2))
yhat21=J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
yhat22=J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
spe2[k]=((magnetic[i]-yhat21)^2+(magnetic[j]-yhat22)^2)/2
J3=lm(log(y) ~ x)
logyhat31 =J3$coef[1] + J3$coef[2] * chemical[i]
logyhat32 =J3$coef[1] + J3$coef[2] * chemical[j]
yhat31=exp(logyhat31)
yhat32=exp(logyhat32)
spe3[k]=((magnetic[i]-yhat31)^2+(magnetic[j]-yhat32)^2)/2
J4=lm(log(y) ~ log(x))
logyhat41 =J4$coef[1] + J4$coef[2] * log(chemical[i])
logyhat42 =J4$coef[1] + J4$coef[2] * log(chemical[j])
yhat41=exp(logyhat41)
yhat42=exp(logyhat42)
spe4[k]=((magnetic[i]-yhat41)^2+(magnetic[j]-yhat42)^2)/2
}
}
c(mean(spe1),mean(spe2),mean(spe3),mean(spe4))
#According to the prediction error criterion, Model 2, the quadratic model, would be the best fit for the data.
J2
par(mfrow = c(1, 2)) #layout for graphs
plot(J2$fit, J2$res) #residuals vs fitted values
abline(0, 0) #reference line
qqnorm(J2$res) #normal probability plot
qqline(J2$res) #reference line
par(mfrow = c(1, 1)) #restore display
## ------------------------------------------------------------------------
set.seed(1)
library(nortest)
chickwts
x=sort(as.vector(chickwts$weight[chickwts$feed == "soybean"]))
y=sort(as.vector(chickwts$weight[chickwts$feed == "linseed"]))
n=length(x)
m=length(y)
R=999 #number of replicates
z=c(x, y) #pooled sample
K=1:26
D=numeric(R) #storage for replicates
fnx=ecdf(x) #the ecdf of x
fny=ecdf(y) #the ecdf of y
options(warn = -1)
D0=(sum((fnx(x)-fny(x))^2)+sum((fnx(y)-fny(y))^2))/((m*n)/((m+n)^2)) #statistic
for (i in 1:R){
k=sample(K,size=n,replace=FALSE) #permutation
x1=z[k]
y1=z[-k]
D[i]=(sum((fnx(x1)-fny(x1))^2)+sum((fnx(y1)-fny(y1))^2))/((m*n)/((m+n)^2)) #statistic
}
options(warn = 0)
p=mean(D>D0)
p
## ------------------------------------------------------------------------
set.seed(1)
library(RANN)
library(energy)
library(Ball)
library(boot)
#(1) Unequal variances and equal expectations
m=40 #number of replicates
R=999
k=3
Tn= function(z, ix, sizes,k) {
n1=sizes[1]
n2 = sizes[2]
n =n1 + n2
if(is.vector(z)) z=data.frame(z,0);
z =z[ix, ];
NN =nn2(data=z, k=k+1)
block1= NN$nn.idx[1:n1,-1]
block2=NN$nn.idx[(n1+1):n,-1]
i1 =sum(block1 < n1 + .5);
i2=sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}
eqdist.nn=function(z,sizes,k){
boot.obj=boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p1.values=matrix(NA,m,3)
for(i in 1:m){
x=rnorm(10,4,4) #generate a sample x~N(4,4)
y=rnorm(10,4,1) #generate a sample y~n(4,1)
z=c(x,y)
n1=length(x)
n2=length(y)
N=c(n1,n2)
p1.values[i,1]=eqdist.nn(z,N,k)$p.value
p1.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
p1.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
eqdist.nn(z,N,k)$p.value
alpha=0.1
pow1=colMeans(p1.values<alpha)
pow1
names(pow1)=c("NN", "energy", "ball")
barplot(pow1)
#(2) Unequal variances and unequal expectations
m=40 #number of replicates
R=999
k=3
z=c(x,y) # pooled sample
eqdist.nn=function(z,sizes,k){
boot.obj = boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p2.values=matrix(NA,m,3)
for(i in 1:m){
x=rnorm(10,4,4) #generate a sample x~N(4,4)
y=rnorm(10,2,1) #generate a sample y~N(2,1)
z=c(x,y)
n1=length(x)
n2=length(y)
N=c(n1,n2)
p2.values[i,1]=eqdist.nn(z,N,k)$p.value
p2.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
p2.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow2=colMeans(p2.values<alpha)
pow2
names(pow2)=c("NN", "energy", "ball")
barplot(pow2)
#(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)
m=40 #number of replicates
R=999
k=3
z=c(x,y) #poled sample
eqdist.nn=function(z,sizes,k){
boot.obj =boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p3.values=matrix(NA,m,3)
for(i in 1:m){
x=rt(10,1) #generate a sample x~t(1)
y=c(rnorm(5,2,1),rnorm(5,4,4)) #generate a bimodel sample y which is a mixture of N(2,1) and N(4,4)
z=c(x,y)
n1=length(x)
n2=length(y)
N=c(n1,n2)
p3.values[i,1]=eqdist.nn(z,N,k)$p.value
p3.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
p3.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow3=colMeans(p3.values<alpha)
pow3
names(pow3)=c("NN", "energy", "ball")
barplot(pow3)
#(4)Unbalanced samples (say, 1 case versus 10 controls)
m=40 #number of replicates
R=999
k=3
z=c(x,y) #pooled sample
eqdist.nn =function(z,sizes,k){
boot.obj= boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p4.values=matrix(NA,m,3)
for(i in 1:m){
x=rexp(10,4) #generate a sample x~exp(4)
y=rf(15,1,2) #generate a sample y~F(1,2),and the numbers of them are different
z=c(x,y)
n1=length(x)
n2=length(y)
N=c(n1,n2)
p4.values[i,1]=eqdist.nn(z,N,k)$p.value
p4.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
p4.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow4=colMeans(p4.values<alpha)
pow4
names(pow4)=c("NN", "energy", "ball")
barplot(pow4)
## ------------------------------------------------------------------------
set.seed(1)
Gelman.Rubin=function(psi) {
# psi[i,j] is the statistic psi(X[i,1:j])
# for chain in i-th row of X
psi =as.matrix(psi)
n=ncol(psi)
k=nrow(psi)
psi.means=rowMeans(psi) #row means
B=n * var(psi.means) #between variance est.
psi.w=apply(psi, 1, "var") #within variances
W=mean(psi.w) #within est.
v.hat=W*(n-1)/n + (B/n) #upper variance est.
r.hat=v.hat / W #G-R statistic
return(r.hat)
}
f=function(theta,x){
f1=(0.5+theta/4)^x[1]
f2=((1-theta)/4)^x[2]
f3=((1-theta)/4)^x[3]
f4=(theta/4)^x[4]
if(theta>=0 && theta<1)
return(f1*f2*f3*f4)
else{
return(0)
}
}
unif.chain=function(m,p1){
#generates a Metropolis chain for unif(0,1)
#with unif(-1,1) proposal distribution
#and starting value X1
p=rep(0, m)
p[1]=p1
u=runif(m)
y=runif(m,-1,1) #the proposal distribution
for (i in 2:m) {
theta=p[i-1]+y[i]
num= f(theta,x)
den=f(p[i-1],x)
if(u[i]<=num/den)
p[i]=theta
else{
p[i]=p[i-1]
}
}
return(p)
}
m=5000 #the number of replicates
b=4700 #burn-in length
t=4 #number of chains to generate
x=c(125,18,20,34)
p0=c(0.2,0.4,0.6,0.8) #choose overdispersed initial values
#generate the chains
P=matrix(0,nrow=t,ncol=m)
for(i in 1:t)
P[i,]=unif.chain(m,p0[i])
#compute diagnostic statistics
psi=t(apply(P, 1, cumsum))
for (i in 1:nrow(psi))
psi[i,]=psi[i,] / (1:ncol(psi))
print(Gelman.Rubin(psi))
## ------------------------------------------------------------------------
set.seed(1)
library(rootSolve)
k=c(4:25,100,500,1000) #the degrees
n=length(k)
solution=numeric(n)
s=numeric(n)
for(i in 1:n){
s1=function(a){
q=sqrt((a^2)*(k[i]-1)/(k[i]-a^2))
pt(q,df=k[i]-1,lower.tail = FALSE,log.p = FALSE)
}
s2=function(a){
q=sqrt((a^2)*k[i]/(k[i]+1-a^2))
pt(q,df=k[i],lower.tail = FALSE,log.p = FALSE)
}
f=function(a){
s1(a)-s2(a)
}
solution[i]=uniroot(f,c(0.0001,sqrt(k[i])-0.0001))$root #the abscissa of the intersection points
s[i]=s1(solution[i]) #the ordinate of the intersection points
}
print(round(rbind(solution,s),5))
## ------------------------------------------------------------------------
set.seed(1)
f=function(x,theta,eta){
1/(theta*pi*(1+((x-eta)/theta)^2))
}
res=integrate(f,lower=-Inf,upper=6,rel.tol=.Machine$double.eps^0.25,theta=1,eta=1)
print(round(rbind(res$value,pcauchy(6,lower.tail = TRUE)),5))
## ------------------------------------------------------------------------
ABOexpect=function(nA,nB,p,q,r) {
nAA=nA * p / (p + 2*r)
nAO=nA - nAA
nBB=nB * q / (q + 2*r)
nBO=nB - nBB
N=list(AA=nAA,AO=nAO,BB=nBB,BO=nBO)
}
ABOmaximize=function(nAA,nAO,nBB,nBO,nAB,nO) {
p=(2 * nAA + nAO + nAB) / (2 * (nA + nB + nO + nAB))
q=(2 * nBB + nBO + nAB) / (2 * (nA + nB + nO + nAB))
r= 1.0 - p - q
L=list(p=p,q=q,r=r)
}
#initial guess
p=0.3
q=0.2
r=0.5
## Observed data (counts of people with the four blood groups)
nA =28
nB=24
nAB=41
nO=40
## Set up iteration
iter=1
diff=1
tol=0.0001 ## tolerance
while (diff > tol) {
E=ABOexpect(nA,nB,p,q,r)
M=ABOmaximize(E$AA,E$AO,E$BB,E$BO,nAB,nO)
diff=abs(M$p - p) + abs(M$q - q) + abs(M$r -r)
p=M$p
q=M$q
r=M$r
cat(sprintf("iter:%d, diff:%.2f\n",iter,diff))
iter=iter + 1
}
cat(sprintf("p=%.4f,q=%.4f, r=%.4f\n",p,q,r))
## ------------------------------------------------------------------------
set.seed(1)
library(base)
formulas=list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)
#for loops
out_formulas=vector('list', length(formulas))
for(i in seq_along(formulas)) {
out_formulas[[i]]=lm(formulas[[i]], data = mtcars)
}
out_formulas
#lapply
lapply(formulas, lm, mtcars)
## ------------------------------------------------------------------------
set.seed(1)
bootstraps=lapply(1:10, function(i) {
rows=sample(1:nrow(mtcars), rep = TRUE)
mtcars[rows, ]
}
)
#for loops
out_bootstraps=vector('list', length(bootstraps))
for(i in seq_along(bootstraps)) {
out_bootstraps[[i]]=lm(mpg~disp, data = bootstraps[[i]])
}
out_bootstraps
#lapply
lapply(bootstraps, lm, formula = mpg~disp)
## ------------------------------------------------------------------------
set.seed(1)
rsq=function(mod) summary(mod)$r.squared
#Exercise 3
lapply(out_formulas, rsq)
#Exercise 4
lapply(out_bootstraps, rsq)
## ------------------------------------------------------------------------
set.seed(1)
p=numeric(100)
trials= replicate(100,t.test(rpois(10, 10), rpois(7, 10)),simplify = FALSE)
#for loops
for(i in 1:100){
p[i]=trials[[i]]$p.value
}
p
#sapply
x=c(1:100)
p1=sapply(trials,function(mod) mod$p.value)
p1
#anonymous function
p2=sapply(trials,'[[','p.value')
p2
#compare the consequence of the for loops and sapply
identical(p,p1)
identical(p,p2)
## ------------------------------------------------------------------------
set.seed(1)
library(parallel)
library(foreach)
#parallel
mcvMap=function(f, FUN.VALUE , ...) {
out=Map(f, ...)
vapply(out, FUN, FUN.VALUE)
}
#foreach
y=foreach(x=1:1000,.combine='rbind') %do% exp(x)
## ------------------------------------------------------------------------
set.seed(1)
options(warn = -1)
chisq_test2=function(x, y) {
n=sum(x) + sum(y)
ni._x=sum(x)
ni._y=sum(y)
chisq.statistic=0
for (i in seq_along(x)) {
n.j=x[i] + y[i]
expected_x=(ni._x*n.j)/n
expected_y=(ni._y*n.j)/n
chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
}
chisq.statistic
}
x=c(3,8,4,9)
y=c(6,9,3,5)
a=matrix(c(x,y),4,2)
print(chisq_test2(x,y))
print(chisq.test(a)$statistic)
#compare the computation time of the two functions
microbenchmark::microbenchmark(chisq_test2(x,y),chisq.test(a))
## ------------------------------------------------------------------------
set.seed(1)
x=c(3,8,4,9)
y=c(6,9,3,5)
b=table(x,y)
summary(b)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.