## inst/doc/intro.R

## ------------------------------------------------------------------------
MCestimate=function(n,x){
  u=runif(n)  #Generate iid Uniform(0,1) random numbers
  cdf=numeric(length(x))
  for(i in 1:length(x)){
    g=30*x[i]^3*u^2*(1-u*x[i])^2
    cdf[i]=mean(g)  #the estimated value of the cdf
  }
  phi=pbeta(x,3,3)  #the values computed by the pbeta function
  print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
MCestimate(1000,c(1,2,3,4,5,6,7,8,9)/10)

## ------------------------------------------------------------------------
chisq_test=function(x, y) {
  n=sum(x) + sum(y)
  ni._x=sum(x)
  ni._y=sum(y)
  chisq.statistic=0
  for (i in seq_along(x)) {
    n.j=x[i] + y[i]
    expected_x=(ni._x*n.j)/n
    expected_y=(ni._y*n.j)/n
    chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
    chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
  }
  chisq.statistic
}
chisq_test(c(3,8,4,9),c(6,9,3,5))

## ------------------------------------------------------------------------
Jacknife.estimate=function(x,y){
  set.seed(1)
  n=length(x)
  theta.hat=cor(x,y) #the correlation estimate
  print(theta.hat)

  #compute the jackknife replicates, leave-one-out estimates
  theta.jack=numeric(n)
  for(i in 1:n){
    theta.jack[i]=cor(x[-i],y[-i])
  }
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias of the correlation statistic
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error of the correlation statistic
}
Jacknife.estimate(c(0.1,0.5,1,1.2,0.2,0.4,0.8,0.6,0.8,1.3),c(4,2.7,5.4,3.6,6.3,3.9,4.7,3.9,2.8,5.6))
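#optional cross-check (added; assumes the bootstrap package is installed): the same bias and standard error from bootstrap::jackknife
xy=cbind(c(0.1,0.5,1,1.2,0.2,0.4,0.8,0.6,0.8,1.3),c(4,2.7,5.4,3.6,6.3,3.9,4.7,3.9,2.8,5.6))
j=bootstrap::jackknife(1:nrow(xy),function(i,d) cor(d[i,1],d[i,2]),xy)
c(bias=j$jack.bias,se=j$jack.se)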

## ------------------------------------------------------------------------
ID=c(1,2,3,4)
age=c(25,34,28,52)
status=c("poor","improved","excelled","poor")
patientdata<-data.frame(ID,age,status)
patientdata



## ------------------------------------------------------------------------
m1=matrix(1,nrow=2,ncol=2)
m2=matrix(2,nrow=2,ncol=2)
m=rbind(m1,m2)  #merge matrices
m
n=cbind(m1,m2)
n

## ------------------------------------------------------------------------
x=rnorm(10)
y=rnorm(10)
plot(x,y,xlab="Ten random values",ylab="Ten other values",xlim=c(-2,2),ylim=c(-2,2),main="How to customize a plot with R")

## ------------------------------------------------------------------------
#Solution 1
x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
cp=cumsum(p)
n=1000
r=numeric(n)
r=x[findInterval(runif(n),cp)+1] # the 1000 random numbers generated from the distribution of X
ct=as.vector(table(r))  # count the occurrences of each value of X and store them as a vector
pr=ct/sum(ct)  #the empirical probabilities
s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function
st=as.vector(table(s))  # count the occurrences of each value of X and store them as a vector
psample=st/sum(st)  #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) 
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function
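#optional goodness-of-fit check (added; not in the original): chi-square test of the inverse-transform counts against the theoretical probabilities
chisq.test(ct,p=p)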



## ------------------------------------------------------------------------
#Solution 2
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
x=rep(1,1000) # preallocate a vector of length 1000
for(i in 1:1000){
  u=runif(1)
  if(u<0.1) x[i]=0
  else{
    if(u<0.3) x[i]=1
    else{
      if(u<0.5) x[i]=2
      else{
        if(u<0.7) x[i]=3
        else x[i]=4
      }
    }
  }
}
# x :the 1000 random numbers generated from the distribution of X
ct=as.vector(table(x)) # count the occurrences of each value of X and store them as a vector
pr=ct/sum(ct) #the empirical probabilities
s=sample(c(0,1,2,3,4),size=1000,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated by the sample function
st=as.vector(table(s))  # count the occurrences of each value of X and store them as a vector
psample=st/sum(st)  #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) 
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function



## ------------------------------------------------------------------------
# Beta(3,2) with pdf f(x)=12x^2(1-x), 0<x<1
# proposal g = Uniform(0,1) and c = 12, so accept x when u < f(x)/(c*g(x)) = x^2*(1-x)
n=1000
k=0 #counter for accepted
j=0 #iterations
y=numeric(n)
while(k<n){
  u=runif(1)
  j=j+1
  x=runif(1) #random variate from the proposal g
  if(x^2*(1-x)>u){
    #we accept x
    k=k+1
    y[k]=x #deliver y=x
  }
}
j
# y:the 1000 random numbers generated from the Beta(3,2) distribution
plot(y) #the scatter plot of the 1000 random numbers generated from the Beta(3,2) distribution
hist(y,prob=TRUE,main=expression(f(x)==12*x^2*(1-x))) #density histogram of sample
t=seq(0,1,.01)
lines(t,12*t^2*(1-t)) #density curve f(x)
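#optional check (added; not in the original): compare the accept-reject sample with a direct rbeta(3,2) sample
qqplot(y,rbeta(n,3,2),xlab="accept-reject sample",ylab="rbeta(n,3,2) sample")
abline(0,1) #points should fall near this line if the two samples follow the same distribution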




## ------------------------------------------------------------------------
#generate an Exponential-Gamma mixture (rexp with a Gamma-distributed rate)
n=1000
r=4
beta=2
lambda=rgamma(n, r, beta) #lambda is random
#now supply the sample of lambdas as the exponential rate parameter
y=rexp(n, lambda) #the 1000 random numbers generated from the mixture distribution
plot(y) # the scatter plot of the 1000 random numbers generated from the mixture distribution
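#optional check (added; not in the original): this Exponential-Gamma mixture has the Pareto (Lomax) density r*beta^r/(beta+y)^(r+1)
hist(y,prob=TRUE,breaks=50,main="Exponential-Gamma mixture with Pareto density")
t=seq(0,max(y),.01)
lines(t,r*beta^r/(beta+t)^(r+1))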


## ------------------------------------------------------------------------
#write a function to compute a Monte Carlo estimate of the Beta(3,3) cdf
my_MCestimate=function(n,x){
  u=runif(n)  #Generate iid Uniform(0,1) random numbers
  cdf=numeric(length(x)) 
  for(i in 1:length(x)){
    g=30*x[i]^3*u^2*(1-u*x[i])^2
    cdf[i]=mean(g)  #the estimated value of the cdf
  }
  phi=pbeta(x,3,3)  #the values computed by the pbeta function
  print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
x=c(1,2,3,4,5,6,7,8,9)/10
n=1000
my_MCestimate(n,x) #the estimated values returned by the my_MCestimate function



## ------------------------------------------------------------------------
#Estimate the Rayleigh(sigma) cdf by Monte Carlo integration, with and without antithetic variables.
MC.av=function(x, m=10000, antithetic=TRUE) {
  u=runif(m/2) #generate u(1),...,u(m/2) ~ Uniform(0,1)
  if (!antithetic) v=runif(m/2) else v=1-u #antithetic variables
  u=c(u, v)
  sigma=10
  cdf=numeric(length(x))
  for (i in 1:length(x)) {
    g=u*x[i]^2/(sigma^2)*exp(-(u*x[i])^2/(2*sigma^2)) #integrand after the substitution t=u*x[i]
    cdf[i]=mean(g)  #the estimated value of the cdf F(x[i])
  }
  cdf
}
x=seq(1.0,5.0,length=10)
set.seed(123)
MC1=MC.av(x,antithetic=FALSE) #cdf estimates without antithetic variables
set.seed(123)
MC2=MC.av(x) #cdf estimates with antithetic variables
print(round(rbind(x,MC1,MC2),10))
c(sd(MC1),sd(MC2))
print((var(MC1)-var(MC2))/var(MC1)) #the percent reduction in variance  of $\frac{X+X'}{2}$  compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$
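#optional check (added; not in the original): the closed-form Rayleigh(sigma=10) cdf is F(x)=1-exp(-x^2/(2*sigma^2))
round(rbind(MC2,exact=1-exp(-x^2/200)),5)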

## ------------------------------------------------------------------------
m=10000
theta.hat=se=numeric(2)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rnorm(m) #using f1
fg1=g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
theta.hat[1]=mean(fg1) #The first estimate
se[1]=sd(fg1) #the standard deviation of the importance-sampling terms for f1
x=rgamma(m,3,1) #using f2, the Gamma(3,1) density
fg2=g(x)/((1/2)*(x^2)*exp(-x)) #the indicator (x>1) inside g() restricts the integrand, so the sample need not be truncated
theta.hat[2]=mean(fg2) #The second estimate
se[2]=sd(fg2) #the standard deviation of the importance-sampling terms for f2
rbind(theta.hat, se)
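#optional check (added; not in the original): the value of the integral computed by numerical integration
integrate(g,1,Inf)$value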

#Draw the curves of g, f1 and f2, then compare them
curve(g,1,5,ylab="",main="The comparison of the curves of g,f1 and f2")
f1=function(x){
  (1/sqrt(2*pi))*exp(-(x^2)/2)
}
curve(f1,1,5,add=TRUE,col="red")
f2=function(x){
  (1/2)*(x^2)*exp(-x)
}
curve(f2,1,5,add=TRUE,col="green")

#Draw the curves of fg1 and fg2, and compare them
fg1=function(x){
  g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
}
curve(fg1,1,5,ylab="",main="The comparison of the curves of fg1 and fg2")
fg2=function(x){
  g(x)/((1/2)*(x^2)*exp(-x))
}
curve(fg2,1,5,add=TRUE,col="red")

## ------------------------------------------------------------------------
m=10000
theta.hat=se=numeric(1)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rgamma(m,3,1) #using f, the Gamma(3,1) density
fg=g(x)/((1/2)*(x^2)*exp(-x)) #the indicator (x>1) inside g() restricts the integrand, so the sample need not be truncated
theta.hat[1]=mean(fg) #the estimate
se[1]=sd(fg) #the standard deviation of the importance-sampling terms
rbind(theta.hat, se)

## ------------------------------------------------------------------------
#the number of test n=1000
#estimate the G
n=1000
G.hat=matrix(0,nrow=3,ncol=n)
mean.G.hat=numeric(3)
median.G.hat=numeric(3)
deciles.G.hat=numeric(3)
x=matrix(0,nrow=3,ncol=n)
mu.hat=matrix(0,nrow=3,ncol=n)
t=numeric(n)
for(k in 1:3){
  for(i in 1:n){ 
    # i indexes the i-th replicate
    x[1,1:n]=sort(rlnorm(n)) #generate x1,...,xn from the standard lognormal distribution and sort them
    x[2,1:n]=sort(runif(n)) #generate x1,...,xn from the Uniform(0,1) distribution and sort them
    x[3,1:n]=sort(rbinom(n,1,0.1)) #generate x1,...,xn from the Bernoulli(0.1) distribution and sort them
    mu.hat[k,i]=sum(x[k,1:n])/n  #mu replaced by the mean of x
    for(j in 1:n){
      t[j]=(2*j-n-1)*x[k,j]
    }
    G.hat[k,i]=sum(t[1:n])/(mu.hat[k,i]*(n^2)) #the estimate of G from the k-th distribution
  }
}

#Estimate the mean, median and first decile of G.hat when X is standard lognormal, Uniform(0,1) and Bernoulli(0.1)
mean.G.hat=apply(G.hat,1,mean) #the mean of G.hat
median.G.hat=apply(G.hat,1,median) #the median of G.hat
deciles.G.hat[1]=quantile(G.hat[1,1:n],0.1) #the first decile (10th percentile) of G.hat
deciles.G.hat[2]=quantile(G.hat[2,1:n],0.1)
deciles.G.hat[3]=quantile(G.hat[3,1:n],0.1)
round(rbind(mean.G.hat,median.G.hat,deciles.G.hat),3) #output the mean, median and first decile of G.hat when X is standard lognormal, Uniform(0,1) and Bernoulli(0.1)
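#optional comparison (added; not in the original): the theoretical Gini coefficients are
#2*pnorm(1/sqrt(2))-1 for the standard lognormal, 1/3 for Uniform(0,1) and 1-0.1 for Bernoulli(0.1)
round(c(2*pnorm(1/sqrt(2))-1,1/3,1-0.1),3)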

#construct the density histograms of the replicates
hist(G.hat[1,1:n],prob=TRUE,main="density histogram of G.hat when X is standard lognormal")
hist(G.hat[2,1:n],prob=TRUE,main="density histogram of G.hat when X is Uniform(0,1)")
hist(G.hat[3,1:n],prob=TRUE,main="density histogram of G.hat when X is Bernoulli(0.1)")

## ------------------------------------------------------------------------

#construct a function to return the 95% confidence interval of gamma
CI=function(n,a,b,alpha=0.05){
  t=numeric(n)
  g=numeric(n)
  for(i in 1:n){
    x=sort(rlnorm(n,a,b))
    mu.hat=mean(x)
    for(j in 1:n){
      t[j]=(2*j-n-1)*x[j]
    }
    g[i]=mean(t)/(n*mu.hat)
  }
  gamma.hat=mean(g)
  sd.hat=sqrt(sum((g-mean(g))^2))/n #estimate the standard deviation of gamma.hat
  c=qnorm(1-alpha/2,0,1)*sd.hat
  CI1=gamma.hat-c #the confidence lower limit of gamma
  CI2=gamma.hat+c #the confidence upper limit of gamma
  c(CI1,CI2)
}
CI(1000,1,1,0.05) #construct an approximate 95% confidence interval of gamma when the parameters are given

#assess the coverage rate
d=matrix(0,nrow=100,ncol=2)
for(k in 1:100){
  d[k,1:2]=CI(1000,1,1,0.05)
}
CI1=d[1:100,1]
CI2=d[1:100,2]
#simulate the true value of gamma
gamma=2*pnorm(1/sqrt(2),0,1)-1  #the Gini coefficient of a lognormal(0,sigma) distribution is 2*pnorm(sigma/sqrt(2))-1; here sigma=1
gamma
m=0
for(i in 1:100){
  if(gamma>=CI1[i] && gamma<=CI2[i]) m=m+1 #count the intervals that cover the true gamma
}
m
CP=m/100
CP #the coverage rate 


## ------------------------------------------------------------------------
n=1000
mu=0
sigma=0.9
repsig=400
repmean=100
system.time({
  G=function(x,mu=mean(x),n=length(x)) sort(x)%*%(2*(1:n)-n-1)/(n^2*mu) #the Gini statistic
  mG=function(sig){
    mu=mean(rlnorm(repmean*n,0,sig))
    mean(sapply(1:repmean,function(...) G(rlnorm(n,0,sig),mu)))
  }
  X=rlnorm(n,mu,sigma)
  print((CI_sig=sqrt(var(log(X))/qchisq(c(0.9875,0.0125),df=n-1)*(n-1))))
  rsig=sqrt(var(log(X))/qchisq(runif(repsig,0.0125,0.9875),df=n-1)*(n-1))
  Gs=sapply(rsig,mG)
  print(quantile(Gs,c(0.0125,0.9875)))
})


## ------------------------------------------------------------------------
library(MASS)
Sigma=matrix(c(1,2,2,9),2,2)  #the covariance matrix (symmetric, covariance 2, variances 1 and 9)
A=mvrnorm(n=1000, rep(0, 2), Sigma) #the sampled distribution is bivariate normal
plot(A[,1],A[,2])
cor.test(A[,1],A[,2],alternative="two.sided",method="pearson",conf.level=0.95)
cor.test(A[,1],A[,2],alternative="two.sided",method="kendall",conf.level=0.95)
cor.test(A[,1],A[,2],alternative="two.sided",method="spearman",conf.level=0.95)

#compute the empirical power of the three correlation tests
methods=c("pearson","kendall","spearman")
power=matrix(0,nrow=20,ncol=3,dimnames=list(NULL,methods))
for(i in 1:20) {
  pvalues <- replicate(100, expr = {
    Sigma=matrix(c(1,2,2,9),2,2)
    A=mvrnorm(n=100, rep(0, 2), Sigma)
    sapply(methods, function(meth) cor.test(A[,1],A[,2],alternative="two.sided",method=meth)$p.value)
  })
  power[i,]=rowMeans(pvalues<=0.05) #proportion of rejections at level 0.05 for each test
}
colMeans(power) #average empirical power of the Pearson, Kendall and Spearman tests

## ------------------------------------------------------------------------
set.seed(1)
library(bootstrap) #for the law data
LSAT=law$LSAT
GPA=law$GPA
n=nrow(law)
system.time({
  theta.hat=cor(LSAT,GPA) #the correlation estimate
  print(theta.hat)

  #compute the jackknife replicates, leave-one-out estimates
  theta.jack=numeric(n)
  for(i in 1:n){
    theta.jack[i]=cor(LSAT[-i],GPA[-i])
  }
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias of the correlation statistic
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error of the correlation statistic
})


## ------------------------------------------------------------------------
set.seed(1)
library(boot)   #for boot and boot.ci
n=100 #the number of bootstrap replicates
system.time({
  x=c(3,5,7,18,43,85,91,98,100,130,230,487)
  boot.mean=function(x,i) mean(x[i])
  mean.hat=mean(x)
  boot.obj=boot(x,statistic = boot.mean,R=n)
  print(boot.ci(boot.obj,type=c("norm","basic","perc","bca")))
  
})


## ------------------------------------------------------------------------
set.seed(1)
data(scor, package = "bootstrap") #for the score data
library(MASS)
scor.m= matrix(as.numeric(t(scor)), 5, 88)
n=nrow(scor)  #sample size
mu=rowMeans(scor.m)
mu0=matrix(mu,nrow=5,ncol=n)
system.time({
  sigma.hat=((scor.m-mu0)%*%t(scor.m-mu0))/(n-1) # the sample covariance matrix
  ev.hat=sort(eigen(sigma.hat)$values,decreasing=T)
  theta.hat=ev.hat[1]/sum(ev.hat) 
  print(theta.hat) #the estimate of theta (printed explicitly because auto-printing is suppressed inside system.time)

  #obtain the jackknife estimates of bias and standard error of theta.hat
  theta.jack=numeric(n)
  for(i in 1:n){
    scor.m.jack= scor.m[,-i]  #the leave-one-out sample (jackknife replicate)
    mu1=matrix(rowMeans(scor.m.jack),nrow=5,ncol=n-1) #the leave-one-out mean
    sigma.jack=((scor.m.jack-mu1)%*%t(scor.m.jack-mu1))/(n-1)
    ev.jack=sort(eigen(sigma.jack)$values,decreasing=T)
    theta.jack[i]=ev.jack[1]/sum(ev.jack) 
  }
  theta.jack
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias 
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error 
  
})
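#optional cross-check (added; not in the original): the same ratio from the sample covariance computed with cov()
ev=eigen(cov(scor))$values
ev[1]/sum(ev)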



## ------------------------------------------------------------------------


set.seed(1)
library(DAAG)
attach(ironslag)
n=length(magnetic) #in DAAG ironslag
m=n*(n-1)/2  # leave-two-out cross validation requires fitting and validating the model m times
spe1=spe2=spe3=spe4=numeric(m)
# for leave-two-out cross validation
# fit models on leave-two-out samples
k=0
for(i in 1:(n-1)){
  for(j in (i+1):n){
    k=k+1
    y=magnetic[-c(i,j)] #leave-two-out 
    x=chemical[-c(i,j)]
    
    J1=lm(y ~ x)
    yhat11=J1$coef[1] + J1$coef[2] * chemical[i]   #the fitted value at the i-th left-out point
    yhat12=J1$coef[1] + J1$coef[2] * chemical[j]   #the fitted value at the j-th left-out point
    spe1[k]=((magnetic[i]-yhat11)^2+(magnetic[j]-yhat12)^2)/2   #the average squared prediction error for the k-th left-out pair

    J2=lm(y ~ x + I(x^2))
    yhat21=J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
    yhat22=J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
    spe2[k]=((magnetic[i]-yhat21)^2+(magnetic[j]-yhat22)^2)/2

    J3=lm(log(y) ~ x)
    logyhat31 =J3$coef[1] + J3$coef[2] * chemical[i]
    logyhat32 =J3$coef[1] + J3$coef[2] * chemical[j]
    yhat31=exp(logyhat31)
    yhat32=exp(logyhat32)
    spe3[k]=((magnetic[i]-yhat31)^2+(magnetic[j]-yhat32)^2)/2

    J4=lm(log(y) ~ log(x))
    logyhat41 =J4$coef[1] + J4$coef[2] * log(chemical[i])
    logyhat42 =J4$coef[1] + J4$coef[2] * log(chemical[j])
    yhat41=exp(logyhat41)
    yhat42=exp(logyhat42)
    spe4[k]=((magnetic[i]-yhat41)^2+(magnetic[j]-yhat42)^2)/2
    
  }
}
c(mean(spe1),mean(spe2),mean(spe3),mean(spe4))


#According to the prediction error criterion, Model 2, the quadratic model, would be the best fit for the data.
J2
par(mfrow = c(1, 2)) #layout for graphs
plot(J2$fit, J2$res) #residuals vs fitted values
abline(0, 0) #reference line
qqnorm(J2$res) #normal probability plot
qqline(J2$res) #reference line
par(mfrow = c(1, 1)) #restore display

## ------------------------------------------------------------------------
set.seed(1)
library(nortest)
chickwts
x=sort(as.vector(chickwts$weight[chickwts$feed == "soybean"]))
y=sort(as.vector(chickwts$weight[chickwts$feed == "linseed"]))
n=length(x)
m=length(y)


R=999 #number of replicates
z=c(x, y) #pooled sample
K=1:26
D=numeric(R) #storage for replicates
fnx=ecdf(x)  #the ecdf of x
fny=ecdf(y)   #the ecdf of y
options(warn = -1)
D0=(m*n)/((m+n)^2)*(sum((fnx(x)-fny(x))^2)+sum((fnx(y)-fny(y))^2))  #the two-sample Cramer-von Mises statistic
for (i in 1:R){
  k=sample(K,size=n,replace=FALSE) #permutation
  x1=z[k]
  y1=z[-k]
  fn1=ecdf(x1)  #the ecdf of the permuted first sample
  fn2=ecdf(y1)  #the ecdf of the permuted second sample
  D[i]=(m*n)/((m+n)^2)*(sum((fn1(x1)-fn2(x1))^2)+sum((fn1(y1)-fn2(y1))^2))  #the replicate of the statistic
}

options(warn = 0)
p=mean(D>D0)
p
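#optional informal comparison (added; not in the original): the two-sample Kolmogorov-Smirnov test on the same samples
ks.test(x,y)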

## ------------------------------------------------------------------------
set.seed(1)
library(RANN)
library(energy)
library(Ball)
library(boot)


#(1) Unequal variances and equal expectations

m=40 #number of replicates
R=999
k=3



Tn= function(z, ix, sizes,k) {
  n1=sizes[1]
  n2 = sizes[2]
  n =n1 + n2
  if(is.vector(z)) z=data.frame(z,0);
  z =z[ix, ];
  NN =nn2(data=z, k=k+1) 
  block1= NN$nn.idx[1:n1,-1]
  block2=NN$nn.idx[(n1+1):n,-1]
  i1 =sum(block1 < n1 + .5); 
  i2=sum(block2 > n1+.5)
  (i1 + i2) / (k * n)
}



eqdist.nn=function(z,sizes,k){
boot.obj=boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p1.values=matrix(NA,m,3)
for(i in 1:m){
  x=rnorm(10,4,4) #generate a sample x~N(4,4)
  y=rnorm(10,4,1) #generate a sample y~n(4,1)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p1.values[i,1]=eqdist.nn(z,N,k)$p.value
  p1.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p1.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
eqdist.nn(z,N,k)$p.value
alpha=0.1
pow1=colMeans(p1.values<alpha)
pow1
names(pow1)=c("NN", "energy", "ball")
barplot(pow1)




#(2) Unequal variances and unequal expectations

m=40  #number of replicates
R=999
k=3
z=c(x,y) # pooled sample
eqdist.nn=function(z,sizes,k){
boot.obj = boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p2.values=matrix(NA,m,3)
for(i in 1:m){
  x=rnorm(10,4,4) #generate a sample x~N(4,4)
  y=rnorm(10,2,1) #generate a sample y~N(2,1)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p2.values[i,1]=eqdist.nn(z,N,k)$p.value
  p2.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p2.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow2=colMeans(p2.values<alpha)
pow2
names(pow2)=c("NN", "energy", "ball")
barplot(pow2)



#(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodal distribution (mixture of two normal distributions)

m=40 #number of replicates
R=999
k=3
z=c(x,y) #pooled sample
eqdist.nn=function(z,sizes,k){
boot.obj =boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p3.values=matrix(NA,m,3)
for(i in 1:m){
  x=rt(10,1)  #generate a sample x~t(1)
  y=c(rnorm(5,2,1),rnorm(5,4,4)) #generate a bimodal sample y which is a mixture of N(2,1) and N(4,4)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p3.values[i,1]=eqdist.nn(z,N,k)$p.value
  p3.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p3.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow3=colMeans(p3.values<alpha)
pow3
names(pow3)=c("NN", "energy", "ball")
barplot(pow3)




#(4)Unbalanced samples (say, 1 case versus 10 controls)

m=40  #number of replicates
R=999
k=3
z=c(x,y) #pooled sample
eqdist.nn =function(z,sizes,k){
boot.obj= boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p4.values=matrix(NA,m,3)
for(i in 1:m){
  x=rexp(10,4)  #generate a sample x~exp(4)
  y=rf(15,1,2)  #generate a sample y~F(1,2),and the numbers of them are different
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p4.values[i,1]=eqdist.nn(z,N,k)$p.value
  p4.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p4.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow4=colMeans(p4.values<alpha)
pow4
names(pow4)=c("NN", "energy", "ball")
barplot(pow4)



## ------------------------------------------------------------------------
set.seed(1)
Gelman.Rubin=function(psi) {
# psi[i,j] is the statistic psi(X[i,1:j])
# for chain in i-th row of X
psi =as.matrix(psi)
n=ncol(psi)
k=nrow(psi)
psi.means=rowMeans(psi) #row means
B=n * var(psi.means) #between variance est.
psi.w=apply(psi, 1, "var") #within variances
W=mean(psi.w) #within est.
v.hat=W*(n-1)/n + (B/n) #upper variance est.
r.hat=v.hat / W #G-R statistic
return(r.hat)
}


  f=function(theta,x){
    f1=(0.5+theta/4)^x[1]
    f2=((1-theta)/4)^x[2]
    f3=((1-theta)/4)^x[3]
    f4=(theta/4)^x[4]
    if(theta>=0 && theta<1)
      return(f1*f2*f3*f4)
    else{
      return(0)
      }
  }
  
  
unif.chain=function(m,p1){
  #generate a Metropolis chain for the posterior of theta given the multinomial data x
  #with a Uniform(-1,1) random-walk increment as the proposal
  #and starting value p1
  p=rep(0, m)
  p[1]=p1
  u=runif(m)
  y=runif(m,-1,1) #the proposal distribution
  for (i in 2:m) {
    theta=p[i-1]+y[i]
    num= f(theta,x)
    den=f(p[i-1],x)
    if(u[i]<=num/den)
      p[i]=theta
    else{
      p[i]=p[i-1]
    }
    }
return(p)
}
  
  
m=5000 #the number of replicates
b=4700 #burn-in length
t=4 #number of chains to generate
x=c(125,18,20,34)
p0=c(0.2,0.4,0.6,0.8) #choose overdispersed initial values


#generate the chains
P=matrix(0,nrow=t,ncol=m)
for(i in 1:t)
  P[i,]=unif.chain(m,p0[i])

#compute diagnostic statistics
psi=t(apply(P, 1, cumsum))
for (i in 1:nrow(psi))
psi[i,]=psi[i,] / (1:ncol(psi))
print(Gelman.Rubin(psi))
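#optional diagnostic (added; not in the original): trace the Gelman-Rubin statistic as the chain length grows
rhat=rep(0,m)
for (j in (b+1):m) rhat[j]=Gelman.Rubin(psi[,1:j])
plot(rhat[(b+1):m],type="l",xlab="chain length beyond the burn-in b",ylab="R hat")
abline(h=1.2,lty=2) #the usual R<1.2 convergence guideline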






## ------------------------------------------------------------------------
set.seed(1)
library(rootSolve)
k=c(4:25,100,500,1000)  #the degrees of freedom
n=length(k)
solution=numeric(n)
s=numeric(n)
for(i in 1:n){
  s1=function(a){
    q=sqrt((a^2)*(k[i]-1)/(k[i]-a^2))
    pt(q,df=k[i]-1,lower.tail = FALSE,log.p = FALSE)
  }
  s2=function(a){
    q=sqrt((a^2)*k[i]/(k[i]+1-a^2))
    pt(q,df=k[i],lower.tail = FALSE,log.p = FALSE)
  }
  f=function(a){
    s1(a)-s2(a)
  }
  solution[i]=uniroot(f,c(0.0001,sqrt(k[i])-0.0001))$root  #the abscissa of the intersection points
  s[i]=s1(solution[i])  #the ordinate of the intersection points
  }
print(round(rbind(solution,s),5))




## ------------------------------------------------------------------------
set.seed(1)
f=function(x,theta,eta){
  1/(theta*pi*(1+((x-eta)/theta)^2))
}
res=integrate(f,lower=-Inf,upper=6,rel.tol=.Machine$double.eps^0.25,theta=1,eta=1)
print(round(rbind(res$value,pcauchy(6,location=1,scale=1,lower.tail=TRUE)),5)) #compare with pcauchy for the same location (eta=1) and scale (theta=1)
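#optional analytic check (added; not in the original): the Cauchy cdf in closed form is 1/2 + atan((x-eta)/theta)/pi
0.5+atan((6-1)/1)/pi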



## ------------------------------------------------------------------------
ABOexpect=function(nA,nB,p,q,r) {
    nAA=nA * p / (p + 2*r)
    nAO=nA - nAA
    nBB=nB * q / (q + 2*r)
    nBO=nB - nBB
    N=list(AA=nAA,AO=nAO,BB=nBB,BO=nBO)
  }
  
  ABOmaximize=function(nAA,nAO,nBB,nBO,nAB,nO) {
    p=(2 * nAA + nAO + nAB) / (2 * (nA + nB + nO + nAB))
    q=(2 * nBB + nBO + nAB) / (2 * (nA + nB + nO + nAB))
    r= 1.0 - p - q
    L=list(p=p,q=q,r=r)
  }
  
  #initial guess
  p=0.3
  q=0.2
  r=0.5
  ## Observed data (counts of people with the four blood groups)
  nA =28
  nB=24
  nAB=41
  nO=40
  
  ## Set up iteration
  iter=1
  diff=1
  tol=0.0001 ## tolerance
  
  while (diff > tol) {
    E=ABOexpect(nA,nB,p,q,r)
    M=ABOmaximize(E$AA,E$AO,E$BB,E$BO,nAB,nO)
    diff=abs(M$p - p) + abs(M$q - q) + abs(M$r -r)
    p=M$p
    q=M$q
    r=M$r
    cat(sprintf("iter:%d, diff:%.2f\n",iter,diff))
    iter=iter + 1
  }
  
  cat(sprintf("p=%.4f,q=%.4f, r=%.4f\n",p,q,r))
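  #optional cross-check (added; not part of the original EM code): maximize the observed-data multinomial log-likelihood directly
  #under Hardy-Weinberg, P(A)=p^2+2pr, P(B)=q^2+2qr, P(AB)=2pq, P(O)=r^2
  negloglik=function(par){
    p=par[1]; q=par[2]; r=1-p-q
    if(p<=0 || q<=0 || r<=0) return(1e10) #keep the search inside the simplex
    -(nA*log(p^2+2*p*r)+nB*log(q^2+2*q*r)+nAB*log(2*p*q)+nO*log(r^2))
  }
  optim(c(0.3,0.2),negloglik)$par #should be close to the EM solution (p,q)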
  

  

## ------------------------------------------------------------------------
set.seed(1)
library(base)
 formulas=list(
      mpg ~ disp,
      mpg ~ I(1 / disp),
      mpg ~ disp + wt,
      mpg ~ I(1 / disp) + wt
    )
 
 
#for loops
out_formulas=vector('list', length(formulas))
for(i in seq_along(formulas)) {
  out_formulas[[i]]=lm(formulas[[i]], data = mtcars)
  }
out_formulas
 
 
#lapply
lapply(formulas, lm, mtcars)

 
 

## ------------------------------------------------------------------------
set.seed(1)
bootstraps=lapply(1:10, function(i) {
  rows=sample(1:nrow(mtcars), replace = TRUE)
  mtcars[rows, ]
}
  )
 
 
#for loops
out_bootstraps=vector('list', length(bootstraps))
for(i in seq_along(bootstraps)) {
  out_bootstraps[[i]]=lm(mpg~disp, data = bootstraps[[i]])
  }
out_bootstraps


#lapply
lapply(bootstraps, lm, formula = mpg~disp)



## ------------------------------------------------------------------------
set.seed(1)
rsq=function(mod) summary(mod)$r.squared

#Exercise 3
lapply(out_formulas, rsq)

#Exercise 4
lapply(out_bootstraps, rsq)



## ------------------------------------------------------------------------
set.seed(1)
p=numeric(100)
trials= replicate(100,t.test(rpois(10, 10), rpois(7, 10)),simplify = FALSE)

#for loops
for(i in 1:100){
  p[i]=trials[[i]]$p.value
}
p


#sapply with an anonymous function
p1=sapply(trials,function(mod) mod$p.value)
p1

#sapply with [[ to extract the p-values directly
p2=sapply(trials,'[[','p.value')
p2


#compare the consequence of the for loops and sapply
identical(p,p1) 
identical(p,p2)



## ------------------------------------------------------------------------
set.seed(1)
library(parallel)
library(foreach)

#a combination of Map() and vapply(): iterate over several inputs in parallel and simplify the result with type checking
mcvMap=function(f, FUN.VALUE, ...) {
    out=Map(f, ...)
    vapply(out, identity, FUN.VALUE)
}
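#illustrative usage (added; hypothetical values): element-wise sums of two vectors, simplified to a numeric vector
mcvMap(function(a,b) a+b, numeric(1), c(1,2,3), c(4,5,6))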


#foreach
y=foreach(x=1:1000,.combine='rbind') %do% exp(x)




## ------------------------------------------------------------------------
set.seed(1)
options(warn = -1)
chisq_test2=function(x, y) {
    n=sum(x) + sum(y)
    ni._x=sum(x)
    ni._y=sum(y)
    chisq.statistic=0
    for (i in seq_along(x)) {
      n.j=x[i] + y[i]
      expected_x=(ni._x*n.j)/n
      expected_y=(ni._y*n.j)/n
      chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
      chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
    }
    chisq.statistic
  }

x=c(3,8,4,9)
y=c(6,9,3,5)
a=matrix(c(x,y),4,2)
print(chisq_test2(x,y)) 
print(chisq.test(a)$statistic) 
  


#compare the computation time of the two functions

microbenchmark::microbenchmark(chisq_test2(x,y),chisq.test(a))






## ------------------------------------------------------------------------
set.seed(1)
x=c(3,8,4,9)
y=c(6,9,3,5)
b=table(x,y)
summary(b)