dist/doc/Homework.R

## -----------------------------------------------------------------------------
set.seed(0)
n<-20
a1<-vector()
a2<-vector()
a3<-vector()#produce 3 empty vector to store the figures
#uniform
a1<-runif(n,1,2)
#normal
a2<-rnorm(n,1,4)
#exponential
a3<-rexp(n,rate = 1)
a4<-list(a1,a2,a3)#put all the figures into a list names a4
names(a4)<-c("a1","a2","a3")#name the elements of the list in order to read more convenient
a4

## -----------------------------------------------------------------------------
library(graphics)
x<-1:n
plot(x,y=a1,type="p",main = "Uniform",xlab = "Number",ylim = c(min(a1)-1,max(a1)+1))#picture of Uniform
lines(x,a1,type="l",col=1)#black
plot(x,y=a2,type="p",main = "Normal",xlab = "Number",ylim = c(min(a2)-1,max(a2)+1))#picture of Normal
lines(x,a2,type="l",col=2)#red
plot(x,y=a3,type="p",main = "Exponential",xlab = "Number",ylim = c(min(a3)-1,max(a3)+1))#picture of Exponential
lines(x,a3,type="l",col=3)#green

## -----------------------------------------------------------------------------
a_1<-sample(a1,size = 10,replace = FALSE)
a_2<-sample(a2,size = 10,replace = FALSE)
a_3<-sample(a3,size = 10,replace = FALSE)
a_4<-list(a_1,a_2,a_3)
names(a_4)<-c("a_1","a_2","a_3")
a_4

## -----------------------------------------------------------------------------
data<-data.frame(unif = a_1,norm = a_2, exp = a_3)
data
knitr::kable (data)#produce a table
#write.table(data,'A-21038-2021-09-16.txt',quote = F,row.names = F)

## -----------------------------------------------------------------------------
apply(data,2,mean)
apply(data,2,var)

## -----------------------------------------------------------------------------
set.seed(0)
n<-1000
u<-runif(n)# generate the samples of uniform
sigma<-c(1,2,3,4)
for(i in 1:4){
  x<-sqrt(-2*sigma[i]**2*log(1-u)) # generate samples for each sigma
  hist(x,prob=T,main=paste("The value of sigma=",sigma[i]))# plot the histogram of each sigma
  y<-seq(0,15,.01)
  lines(y,y/(sigma[i]**2)*exp(-y**2/(2*sigma[i]**2)))# plot the pdf line
}

## -----------------------------------------------------------------------------
set.seed(1)# fix the random values of samples
n<-1e3# the number of samples
X1<-rnorm(n)# N(0,1)
X2<-rnorm(n,3,1)# N(3,1)
r<-sample(c(0,1),n,replace = T,prob=c(0.25,0.75))
Z<-r*X1+(1-r)*X2
hist(Z,prob=T)# show the density of Z
y<-seq(-8,8,0.01)
lines(y,0.75*dnorm(y,0,1)+(1-0.75)*dnorm(y,3,1))# draw a line show the mixture of X1 and X2

## -----------------------------------------------------------------------------
p1<-seq(0,1,0.05)
k<-length(p1)# the number of the element of p1
for(i in 1:k){
  r<-sample(c(0,1),n,replace = T,prob=c(1-p1[i],p1[i]))
  Z<-r*X1+(1-r)*X2
  hist(Z,prob=T,main = paste("p1 = ",p1[i]))
  y<-seq(-6,6,0.01)
  lines(y,p1[i]*dnorm(y,0,1)+(1-p1[i])*dnorm(y,3,1))# draw lines in every picture
}

## -----------------------------------------------------------------------------
set.seed(2)
n<-1e4
N<-rpois(n,10)# generate Poisson distribution which lambda = 10
X<-vector()
for(i in 1:n){
  Y<-rgamma(N[i],4,2)# generate y according to N(t)
  X[i]<-sum(Y)
}
mean(X)# expectation
var(X)# var

## -----------------------------------------------------------------------------
set.seed(3)
n<-1e4
N<-rpois(n,20)# generate Poisson distribution which lambda = 20
X<-vector()
for(i in 1:n){
  Y<-rgamma(N[i],9,3)# generate y according to N(t)
  X[i]<-sum(Y)
}
mean(X)# expectation
var(X)# var

## -----------------------------------------------------------------------------
set.seed(4)
n<-1e4
N<-rpois(n,30)# generate Poisson distribution which lambda = 30
X<-vector()
for(i in 1:n){
  Y<-rgamma(N[i],16,4)# generate y according to N(t)
  X[i]<-sum(Y)
}
mean(X)# expectation
var(X)# var

## -----------------------------------------------------------------------------
#define a function called mybeta to estimate the cdf of beta distribution, and the parameters alpha and beta should be given as whatever you need.
mybeta<-function(x,alpha,beta){
  n<-1e4 # the number we generate how many samples
  U<-runif(n,0,x)
  value<-mean(30*x*U**(alpha-1)*(1-U)**(beta-1))
  value
}

## -----------------------------------------------------------------------------
alpha<-3;beta<-3
X<-0.1*c(1:9)
m<-length(X)
pvalue1<-vector()# the value of F(x) which are estimated by mento carlo
pvalue2<-vector()# the value of pbeta
for(i in 1:m){
  pvalue1[i]<-mybeta(X[i],alpha = alpha,beta = beta)
}
for(i in 1:m){
  pvalue2[i]<-pbeta(X[i],alpha,beta)
}
compare<-matrix(c(pvalue1,pvalue2,pvalue1-pvalue2),ncol=3)# compare the pvalue generated by different method
colnames(compare)<-c('pvalue1','pvalue2','difference')
compare

## -----------------------------------------------------------------------------
myrayleigh<-function(n,sigma,optimal=c(1,2)){
  if(optimal==1){
    x<-vector()
    u<-runif(n)
    x<-sqrt(-2*sigma**2*log(1-u))
  }
  if(optimal==2){
    x<-vector()
    u<-runif(n)
    x<-(sqrt(-2*sigma**2*log(1-u))+sqrt(-2*sigma**2*log(u)))/2
  }
  x
}

## -----------------------------------------------------------------------------
m<-1e4
#variance of (X+X')/2
v1<-var(myrayleigh(m,1,2))
#variance of (X1+X2)/2
X1<-myrayleigh(m,1,1)
X2<-myrayleigh(m,1,1)
cov<-cov(X1,X2)# verify the indenpedence
v2<-var((X1+X2)/2)
print(paste("X1 and X2 are",ifelse(cov<0.01&&cov>-0.01,'independent','dependent')))
print(paste('the value of v1 is ',v1))
print(paste('the value of v2 is ',v2))
print(paste('the percent reduction is ',(v2-v1)/v2))

## -----------------------------------------------------------------------------
x<-seq(1,10,0.01)
y<-seq(1,10,0.01)
plot(x,y,type="n",ylim=c(0,4))
lines(x,x/(2*exp(1)*sqrt(2*pi))*exp(x^2/2),col=2)
lines(x,x/sqrt(2*exp(1)*pi),col=3)
legend(c(8,9),c(3,4),c('g1','g2'),c(2,3))

## -----------------------------------------------------------------------------
f1_i<-function(n){# inverse function of f1
  x<-vector()
  u<-runif(n)
  x<-sqrt(1-log(1-u))
  x
}
g1<-function(x){# function g1
  y<-vector()
  n<-length(x)
  for(i in 1:n){
    y[i]<-x[i]/(2*exp(1)*sqrt(2*pi))*exp(x[i]**2/2)
  }
  y
}
n<-1e4
v1<-var(g1(f1_i(n)))/n
v1

## -----------------------------------------------------------------------------
f2_i<-function(n){# inverse function of f2
  x<-vector()
  u<-runif(n)
  x<-sqrt(1-2*log(1-u))
  x
}
g2<-function(x){# function g2
  y<-vector()
  n<-length(x)
  for(i in 1:n){
   y[i]<-x[i]/sqrt(2*exp(1)*pi) 
  }
  y
}
n<-1e4
v2<-var(g2(f2_i(n)))/n
v2

## -----------------------------------------------------------------------------
f1_i<-function(n){# inverse function of f1
  x<-vector()
  u<-runif(n)
  x<-sqrt(1-log(1-u))
  x
}
g1<-function(x){# function g1
  y<-vector()
  n<-length(x)
  for(i in 1:n){
    y[i]<-x[i]/(2*exp(1)*sqrt(2*pi))*exp(x[i]**2/2)
  }
  y
}
n<-1e4
value1<-mean(g1(f1_i(n)))
value1

## -----------------------------------------------------------------------------
f2_i<-function(n){# inverse function of f
  x<-vector()
  u<-runif(n)
  x<-sqrt(1-2*log(1-u))
  x
}
g2<-function(x){# function g2
  y<-vector()
  n<-length(x)
  for(i in 1:n){
   y[i]<-x[i]/sqrt(2*exp(1)*pi) 
  }
  y
}
n<-1e4
value2<-mean(g2(f2_i(n)))
value2

## -----------------------------------------------------------------------------
# t-interval
n<-20# the number of sample
alpha<-0.05
UCL<-matrix(nrow=1000,ncol=2)
# chi-squre
for(i in 1:1000){# we will sample 1000 times
   x<-rchisq(n,df=2)
   UCL[i,1]<-mean(x)-sd(x)*qt(1-alpha/2,n-1)/sqrt(n)# the lower
   UCL[i,2]<-mean(x)+sd(x)*qt(1-alpha/2,n-1)/sqrt(n)# the upper
}
head(UCL)# observe some interval
sum<-0# initial sum=0
for(i in 1:1000){
   if(UCL[i,1]<2&&UCL[i,2]>2){
      sum<-sum+1
   }
   else{
      sum<-sum
   }
}# if the mean is not in the interval, then the sum will not change, otherwise it will plus 1
print("the total sum and the coverage probablity:")
sum# the total number which means the mean of the distribution is in the interval
sum/1000
# Normal
for(i in 1:1000){# we will sample 1000 times
   x<-rnorm(n,0,2)
   UCL[i,1]<-mean(x)-sd(x)*qt(1-alpha/2,n-1)/sqrt(n)# the lower
   UCL[i,2]<-mean(x)+sd(x)*qt(1-alpha/2,n-1)/sqrt(n)# the upper
}
head(UCL)# observe some interval
sum<-0# initial sum=0
for(i in 1:1000){
   if(UCL[i,1]<0&&UCL[i,2]>0){
      sum<-sum+1
   }
   else{
      sum<-sum
   }
}# if the mean is not in the interval, then the sum will not change, otherwise it will plus 1
print("the total sum and the coverage probablity:")
sum# the total number which means the mean of the distribution is in the interval
sum/1000

## -----------------------------------------------------------------------------
# interval for variance
# Normal
n<-20
alpha<-0.05
UCL<-replicate(1000,expr={
   x<-rnorm(n,0,2)
   (n-1)*var(x)/qchisq(alpha,df=n-1)
})
mean(UCL>4)
# chi-square
UCL<-replicate(1000,expr={
   x<-rchisq(n,df=2)
   (n-1)*var(x)/qchisq(alpha,df=n-1)
})
mean(UCL>4)

## -----------------------------------------------------------------------------
# write a function to solve this problem, and this function can generate Type I error rate of these three distributions
power<-function(n,alpha,mu0,m,optimal=c(1,2,3)){
   p<-numeric(m)
   if(optimal==1){
     for(i in 1:m){
        x<-rchisq(n,1)
        ttest<-t.test(x,mu=mu0)
        p[i]<-ttest$p.value
     }
   }
   if(optimal==2){
     for(i in 1:m){
        x<-runif(n,0,2)
        ttest<-t.test(x,mu=mu0)
        p[i]<-ttest$p.value
     }
   }
   if(optimal==3){
     for(i in 1:m){
        x<-rexp(n,1)
        ttest<-t.test(x,mu=mu0)
        p[i]<-ttest$p.value
     }
   }
   p.hat<-mean(p<alpha)
   se.hat<-sqrt(p.hat*(1-p.hat)/m)
   print(c(p.hat,se.hat))
}

## -----------------------------------------------------------------------------
set.seed(1)
n<-20
alpha<-0.05
mu0<-1
m<-1000
#chi-square
print("The result of chi-square")
power(n,alpha,mu0,m,1)
#uniform
print("The result of uniform")
power(n,alpha,mu0,m,2)
#exponential
print("The result of exponential")
power(n,alpha,mu0,m,3)

## ---- echo=FALSE--------------------------------------------------------------
data<-data.frame(l=c(6510,6760,13280),m=c(3490,3240,6730),n=c(10000,10000,20000))
rownames(data)<-c("method_1","method_2","total number")
colnames(data)<-c(paste("p<0.05",' '),paste(' ',"p>0.05"),"total number")
knitr::kable (data)#produce a table

## ---- echo=FALSE--------------------------------------------------------------
20000*(6510*3240-6760*3490)^2/(13280*6730*10000^2)
qchisq(0.95,1)

## -----------------------------------------------------------------------------
sk<-function(data,alpha){
  if(is.matrix(data)==1){# whether the formula of the data is matrix
    d<-ncol(data)# dim
    n<-nrow(data)# number of data
    cv<-qchisq(1-alpha,d*(d+1)*(d+2)/6)# quantil
    colmean<-apply(data,2,mean)# the mean of every col
    sigma<-(n-1)*cov(data)/n# the maximum likelihood of corvariance
    b<-t(t(data)-colmean)%*%solve(sigma)%*%(t(data)-colmean)
    b_1d<-sum(b^{3})/n^2
    test<-n*b_1d/6
    as.integer(test>=cv)
  }
  else{
    print("Sorry that the data is not a matrix")
  }
}

## -----------------------------------------------------------------------------
library(MASS)
set.seed(1)
alpha<-0.05
mu <- c(0,0)
sigma <- matrix(c(1,0,0,1),nrow=2,ncol=2)
m<-5000
N<-c(10, 20, 30, 50, 100, 500)
#m: number of replicates; n: sample size
p.reject<-numeric(length(N))
num.reject<-numeric(m)
for(i in 1:length(N)){
   for(j in 1:m){
     data<-mvrnorm(N[i],mu,sigma)
     num.reject[j]<-sk(data,alpha)
   }
  p.reject[i]<-mean(num.reject)
}
p.reject

## -----------------------------------------------------------------------------
library(MASS)
alpha<-0.05
n<-30
m<-2500
epsilon<-c(seq(0,0.15,0.01),seq(0.15,1,0.05))
N<-length(epsilon)
power<-numeric(N)
mu<-c(0,0)
sd1<-matrix(c(1,0,0,1),nrow=2)
sd2<-10*sd1
for(i in 1:N){# for each epsilon
  e<-epsilon[i]
  sktests<-numeric(m)
  for(j in 1:m){# for each replicate
    c<-sample(c(0,1),replace = T,size=n,prob=c(1-e,e))
    data<-(1-c)*mvrnorm(n,mu,sd1)+c*mvrnorm(n,mu,sd2)
    sktests[j]<-sk(data,alpha)
  }
  power[i]<-mean(sktests)
}
plot(epsilon, power, type = "b",xlab = bquote(epsilon), ylim = c(0,1)) 
abline(h = .05, lty = 3)
se <- sqrt(power * (1-power) / m) #add standard errors 
lines(epsilon, power+se, lty = 3)
lines(epsilon, power-se, lty = 3)

## -----------------------------------------------------------------------------
library(bootstrap)
n<-nrow(scor)
sigma_hat<-(n-1)*cov(scor)/n# MLE of sigma_hat
lambda_hat<-eigen(sigma_hat)$values
theta_hat<-lambda_hat[1]/sum(lambda_hat)
theta_hat# the value of theta_hat

## -----------------------------------------------------------------------------
set.seed(1)
B<-5000
theta_star<-numeric(B)
for(b in 1:B){
  i<-sample(1:n,size = n, replace = T)
  scor1<-scor[i,]
  sigma_star<-(n-1)*cov(scor1)/n
  lambda_star<-eigen(sigma_star)$values
  theta_star[b]<-lambda_star[1]/sum(lambda_star)# the value of theta_star
}
round(c(original=theta_hat, bias=mean(theta_star)-theta_hat, se=sd(theta_star)),3)

## -----------------------------------------------------------------------------
theta_jack<-numeric(n)
for(i in 1:n){
  scor2<-scor[-i,]# delete the ith row
  sigma_jack<-(n-2)*cov(scor2)/(n-1)# MLE, the row number of data is n-1
  lambda_jack<-eigen(sigma_jack)$values
  theta_jack[i]<-lambda_jack[1]/sum(lambda_jack)
}
round(c(original=theta_hat,bias.jack=(n-1)*(mean(theta_jack)-theta_hat),
        se.jack=sqrt((n-1)*mean((theta_jack-theta_hat)^2))),3)  

## -----------------------------------------------------------------------------
library(boot)
set.seed(2)
theta.hat<-function(X,i){
  n<-nrow(X)
  lambda<-eigen((n-1)*cov(X[i,])/n)$values
  theta.boot<-lambda[1]/sum(lambda)
  return(theta.boot)
}
theta.obj<-boot(data=scor,statistic = theta.hat,R=5000)
round(c(original=theta.obj$t0,bias=mean(theta.obj$t)-theta.obj$t0,
        se=sd(theta.obj$t)),3)
print(boot.ci(theta.obj,conf=0.95,type=c("perc","bca")))
#percentile
alpha<-c(0.025,0.975)
print("caculate teh percentile interval:")
print(quantile(theta.obj$t,alpha,type=6))

## -----------------------------------------------------------------------------
#BCa
#function boot.BCa
boot.BCa<-function(x,t0,t,stat,conf=0.95){
  # bootstrap with BCa bootstrap confidence interval 
  # t0 is the observed statistic
  # t is the vector of bootstrap replicates
  # stat is the function to compute the statistic
  x <- as.matrix(x)
  n <- nrow(x) #observations in rows 
  N <- 1:n
  alpha <- (1 + c(-conf, conf))/2 
  zalpha <- qnorm(alpha)
  # the bias correction factor
  z0 <- qnorm(sum(t < t0) / length(t))
  # the acceleration factor (jackknife est.) 
  t.jack <- numeric(n)
  for (i in 1:n) {
     J <- N[1:(n-1)]
     t.jack[i] <- stat(x[-i, ], J) 
     }
  L <- mean(t.jack) - t.jack
  a <- sum(L^3)/(6 * sum(L^2)^1.5)
  # BCa conf. limits
  adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha))) 
  limits <- quantile(t, adj.alpha, type=6) 
  return(list("est"=t0, "BCa"=limits))
}
boot.BCa(scor,t0=theta.obj$t0,t=theta.obj$t,stat=theta.hat)

## -----------------------------------------------------------------------------
sk<-function(x,i){# statistic
  meanx<-mean(x[i])
  m3<-mean((x[i]-meanx)^3)
  m2<-mean((x[i]-meanx)^2)
  return(m3/m2^1.5)
}
m<-2000# number of mento carlo
n<-100# number of random variables
sktests1<-numeric(m)# normal
sktests1left<-numeric(m)# the mean < the left of normal
sktests2<-numeric(m)# basic
sktests2left<-numeric(m)# the mean < the left of basic
sktests3<-numeric(m)# percentile
sktests3left<-numeric(m)# the mean < the left of percentile
sk_norm<-0
sk_chisq<-sqrt(8/5)
for(i in 1:m){
  x<-rnorm(n)
  boot.obj<-boot(data=x,statistic = sk, R=2000)# bootstrap
  ci<-boot.ci(boot.obj,conf=0.95,type=c("norm","basic","perc"))
  interval1<-ci$normal[2:3]#normal
  interval2<-ci$basic[4:5]#basic
  interval3<-ci$percent[4:5]#percentile
  sktests1[i]<-as.integer(sk_norm>interval1[1]&&
                         sk_norm<interval1[2])
  sktests1left[i]<-as.integer(sk_norm<interval1[1])
  sktests2[i]<-as.integer(sk_norm>interval2[1]&&
                         sk_norm<interval2[2])
  sktests2left[i]<-as.integer(sk_norm<interval2[1])
  sktests3[i]<-as.integer(sk_norm>interval3[1]&&
                         sk_norm<interval3[2])
  sktests3left[i]<-as.integer(sk_norm<interval3[1])
}
coverage_rate1<-mean(sktests1)
coverage_rate2<-mean(sktests2)
coverage_rate3<-mean(sktests3)
left_rate1<-mean(sktests1left)
left_rate2<-mean(sktests2left)
left_rate3<-mean(sktests3left)
right_rate1<-1-mean(sktests1left)-mean(sktests1)
right_rate2<-1-mean(sktests2left)-mean(sktests2)
right_rate3<-1-mean(sktests3left)-mean(sktests3)
print("the coverage rate")
round(c(normal=coverage_rate1,basic=coverage_rate2,percentile=coverage_rate3),4)
print("the proportion of times theta the confidence intervals miss on the left")
round(c(normal=left_rate1,basic=left_rate2,percentile=left_rate3),4)
print("the proportion of times theta the confidence intervals miss on the right")
round(c(normal=right_rate1,basic=right_rate2,percentile=right_rate3),4)

## -----------------------------------------------------------------------------
m<-2000# number of mento carlo
n<-100# number of random variables
sktests1<-numeric(m)# normal
sktests1left<-numeric(m)# the mean < the left of normal
sktests2<-numeric(m)# basic
sktests2left<-numeric(m)# the mean < the left of basic
sktests3<-numeric(m)# percentile
sktests3left<-numeric(m)# the mean < the left of percentile
for(i in 1:m){
  x<-rchisq(n,df=5)
  boot.obj<-boot(data=x,statistic = sk, R=2000)# bootstrap
  ci<-boot.ci(boot.obj,conf=0.95,type=c("norm","basic","perc"))
  interval1<-ci$norm[2:3]#normal
  interval2<-ci$basic[4:5]#basic
  interval3<-ci$perc[4:5]#percentile
  sktests1[i]<-as.integer(sk_chisq>interval1[1]&&
                         sk_chisq<interval1[2])
  sktests1left[i]<-as.integer(sk_chisq<interval1[1])
  sktests2[i]<-as.integer(sk_chisq>interval2[1]&&
                         sk_chisq<interval2[2])
  sktests2left[i]<-as.integer(sk_chisq<interval2[1])
  sktests3[i]<-as.integer(sk_chisq>interval3[1]&&
                         sk_chisq<interval3[2])
  sktests3left[i]<-as.integer(sk_chisq<interval3[1])
}
coverage_rate1<-mean(sktests1)
coverage_rate2<-mean(sktests2)
coverage_rate3<-mean(sktests3)
left_rate1<-mean(sktests1left)
left_rate2<-mean(sktests2left)
left_rate3<-mean(sktests3left)
right_rate1<-1-mean(sktests1left)-mean(sktests1)
right_rate2<-1-mean(sktests2left)-mean(sktests2)
right_rate3<-1-mean(sktests3left)-mean(sktests3)
print("the coverage rate")
round(c(normal=coverage_rate1,basic=coverage_rate2,percentile=coverage_rate3),4)
print("the proportion of times theta the confidence intervals miss on the left")
round(c(normal=left_rate1,basic=left_rate2,percentile=left_rate3),4)
print("the proportion of times theta the confidence intervals miss on the right")
round(c(normal=right_rate1,basic=right_rate2,percentile=right_rate3),4)

## -----------------------------------------------------------------------------
set.seed(1)
# generate the samples
X<-rnorm(20)
Y<-rnorm(20,100,1)
R<-999 # number of replicates
Z<-c(X,Y)# pooled sample
k<-length(Z)
K<-1:k
reps<-numeric(R)
cor0<-cor(X,Y,method = "spearman")
for(i in 1:R){
  m<-sample(K,size=20,replace = F)
  X1<-Z[m]
  Y1<-Z[-m]
  reps[i]<-cor(X1,Y1,method = "spearman")
}
p<-mean(c(cor0,reps)>=cor0)
p

## -----------------------------------------------------------------------------
cor.test(X1,Y1,method = "spearman")

## -----------------------------------------------------------------------------
library(RANN)
library(energy)
library(Ball)
library(boot)
# Tn function
Tn <- function(z, ix, sizes,k){
   n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2 
   if(is.vector(z)) z <- data.frame(z,0);
   z <- z[ix, ];
   NN <- nn2(data=z, k=k+1) # what's the first column? 
   block1 <- NN$nn.idx[1:n1,-1]
   block2 <- NN$nn.idx[(n1+1):n,-1]
   i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
   (i1 + i2) / (k * n)
}
# power for NN
eqdist.nn <- function(z,sizes,k){
   boot.obj <- boot(data=z,statistic=Tn,R=R, 
                    sim = "permutation", sizes = sizes,k=k) 
   ts <-c(boot.obj$t0,boot.obj$t)
   p.value <- mean(ts>=ts[1]) 
   list(statistic=ts[1],p.value=p.value)
}

## -----------------------------------------------------------------------------
library(MASS)
m <- 1e2; k<-3; set.seed(2)
n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2)
mu1<-c(0,0,0)
sigma1<-matrix(c(1,0,0,0,1,0,0,0,1),ncol=3)
mu2<-c(0,0,0)
sigma2<-matrix(c(2,0,0,0,4,0,0,0,6),ncol=3)
p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <-mvrnorm(n1,mu1,sigma1) ;
  y <-mvrnorm(n2,mu2,sigma2) ;
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,num.permutations=999,seed=i*2)$p.value
}
alpha <- 0.05; 
pow <- colMeans(p.values<alpha)
pow

## -----------------------------------------------------------------------------
library(MASS)
m <- 1e2; k<-3; set.seed(2)
n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2)
mu1<-c(0,0,0)
sigma1<-matrix(c(1,0,0,0,1,0,0,0,1),ncol=3)
mu2<-c(0.5,-0.5,0.5)
sigma2<-matrix(c(2,0,0,0,2,0,0,0,2),ncol=3)
p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <-mvrnorm(n1,mu1,sigma1) ;
  y <-mvrnorm(n2,mu2,sigma2) ;
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,num.permutations=999,seed=i*2)$p.value
}
alpha <- 0.05; 
pow <- colMeans(p.values<alpha)
pow

## -----------------------------------------------------------------------------
m <- 1e2; k<-3;p<-2;  set.seed(12345)
n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2)
p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <- matrix(rt(n1*p,df=1,ncp =10),ncol=p);
  y <- matrix(rt(n2*p,df=1,ncp = 20),ncol=p);
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,num.permutations=999,seed=i*12345)$p.value
}
alpha <- 0.05; 
pow <- colMeans(p.values<alpha)
pow

## -----------------------------------------------------------------------------
bimodel<-function(r,n){
  p<-sample(c(0,1),n,replace = T,prob = c(1-r,r))
  x<-p*rnorm(n)+(1-p)*rnorm(n,0,2)
  return(x)
}
m <- 1e2; k<-3;p<-2;  set.seed(12345)
n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2)
p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <- matrix(bimodel(0.85,n1*p),ncol=p);
  y <- matrix(bimodel(0.15,n2*p),ncol=p);
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,num.permutations=999,seed=i*12345)$p.value
}
alpha <- 0.05; 
pow <- colMeans(p.values<alpha)
pow

## -----------------------------------------------------------------------------
library(MASS)
m <- 1e2; k<-3; set.seed(2)
n1 <-10 ;n2 <- 100; R<-999; n <- n1+n2; N = c(n1,n2)
mu1<-c(0,0,0)
sigma1<-matrix(c(1,0,0,0,1,0,0,0,1),ncol=3)
mu2<-c(0,0,0)
sigma2<-matrix(c(4,0,0,0,4,0,0,0,8),ncol=3)
p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <-mvrnorm(n1,mu1,sigma1) ;
  y <-mvrnorm(n2,mu2,sigma2) ;
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,num.permutations=999,seed=i*2)$p.value
}
alpha <- 0.05; 
pow <- colMeans(p.values<alpha)
pow

## -----------------------------------------------------------------------------
#function
metropolis<-function(sigma,x0,N,k){
  set.seed(k)
  x<-numeric(N)
  x[1]<-x0
  u<-runif(N)
  t<-0
  for(i in 2:N){
    y<-rnorm(1,x[i-1],sigma)
    if(u[i]<=(dcauchy(y,0,1)/dcauchy(x[i-1],0,1))){
      x[i]<-y
    }
    else{
      x[i]<-x[i-1]
      t<-t+1
    }
  }
  return(list(x=x,t=t))
}
#trial
N<-1e4
sigma<-c(1,2,3,4)
x0<-0
m1<-metropolis(sigma[1],x0,N,1)
m2<-metropolis(sigma[2],x0,N,2)
m3<-metropolis(sigma[3],x0,N,3)
m4<-metropolis(sigma[4],x0,N,4)
#compare the deciles
a<-seq(0.1,0.9,0.1)
Q<-qcauchy(a,0,1)#the standard cauchy distribution
m<-cbind(m1$x,m2$x,m3$x,m4$x)
mc<-m[1001:N,]
Qm<-apply(mc,2,function(x) quantile(x,a))
knitr::kable(round(cbind(Q=Q,m1=Qm[,1],m2=Qm[,2],m3=Qm[,3],m4=Qm[,4]),3))

## -----------------------------------------------------------------------------
a<-1
b<-5
n<-10
N<-10000 # length of chain
burn<-1000 # burn-in length
Z<-matrix(0,N,2)
Z[1,]<-c(1,0.1)# initial
for(i in 2:N){
  y<-Z[i-1,2]
  Z[i,1]<-rbinom(1,n,y)
  x<-Z[i,1]
  Z[i,2]<-rbeta(1,x+a,n-x+b)
}
b<-burn+1
z<-Z[b:N,]# the chain
colMeans(z)
cov(z)
cor(z)
plot(z,main="",cex=0.5,xlab = bquote(X),ylab = bquote(Y),ylim = range(z[,2]))

## -----------------------------------------------------------------------------
#Gelman.Rubin convergence
Gelman.Rubin <- function(psi) {
  psi <- as.matrix(psi)
  n <- ncol(psi)
  t <- nrow(psi)
  psi.means <- rowMeans(psi)
  B <- n * var(psi.means)
  psi.w <- apply(psi, 1, "var") 
  W <- mean(psi.w)
  v.hat <- W*(n-1)/n + (B/n) 
  r.hat <- v.hat / W 
  return(r.hat)
}

## -----------------------------------------------------------------------------
#trial
set.seed(1)
x0<-c(-20,-10,10,20)# initial
k<-4# number of chains
n<-15000# length of chains
b<-10# burn-in length
u<-runif(n)
X<-matrix(0,nrow=k,ncol = n)
rhat<-rep(0,n)
X[,1]<-x0
for(j in 2:n){
  for(i in 1:k){
    y<-rcauchy(1,X[i,j-1],6)
    if(u[j]<=(dcauchy(y,0,1)/dcauchy(X[i,j-1],0,1))){
      X[i,j]<-y
    }
    else{
      X[i,j]<-X[i,j-1]
    }
  }
  psi <- t(apply(X[,1:j], 1, cumsum)) 
  for (m in 1:nrow(psi)){
     psi[m,] <- psi[m,] / (1:ncol(psi)) 
  }
  rhat[j]<-Gelman.Rubin(psi)
  if(rhat[j]<=1.2){
    count<-j
    print(count)
    break
  }
}
plot(rhat[(b+1):count], type="l", xlab="", ylab="R") 
abline(h=1.2, lty=2)

## -----------------------------------------------------------------------------
a<-5
b<-5
n<-20
k<-4
N<-5000 # length of chain
burn<-300 # burn-in length
set.seed(0)
X<-matrix(0,nrow=k,ncol=N)
Y<-matrix(0,nrow=k,ncol=N)
X[,1]<-c(1,2,3,4)
Y[,1]<-c(0.1,0.2,0.3,0.4)
rhatx<-rep(0,N)
rhaty<-rep(0,N)
for(j in 2:N){
  for(i in 1:k){
    y<-Y[i,j-1]
    X[i,j]<-rbinom(1,n,y)
    x<-X[i,j]
    Y[i,j]<-rbeta(1,x+a,n-x+b)
  }
  psi1 <- t(apply(X[,1:j], 1, cumsum)) 
  for (m in 1:nrow(psi1)){
     psi1[m,] <- psi1[m,] / (1:ncol(psi1)) 
  }
  psi2 <- t(apply(Y[,1:j], 1, cumsum)) 
  for (m in 1:nrow(psi2)){
     psi2[m,] <- psi2[m,] / (1:ncol(psi2)) 
  }
  rhatx[j]<-Gelman.Rubin(psi1)
  rhaty[j]<-Gelman.Rubin(psi2)
  if(rhatx[j]<=1.2&&rhaty[j]<=1.2){
    count<-j
    print(count)
    break
  }
}

## -----------------------------------------------------------------------------
plot(rhatx[(burn+1):count], type="l", xlab="", ylab="R",ylim=c(1.15,2.2),xlim=c(0,1800)) 
lines(rhaty[(burn+1):count],type="l",col="red")
abline(h=1.2, lty=2)

## -----------------------------------------------------------------------------
kth<-function(a,i){
  d<-length(a)
  if(i==0){
   b<-sum(a^2)*exp(lgamma((d+1)/2)+lgamma(0+3/2)-lgamma(0+d/2+1))/2 # i=0 
  }
  if(i>=1){
    b<-(-1)^i*exp((i+1)*log(sum(a^2))-sum(log(1:i))-i*log(2)-log(2*i+1)-log(2*i+2)) *
      exp(lgamma((d+1)/2)+lgamma(i+3/2)-lgamma(i+d/2+1))
  }
  return(b)
}
kth(c(1,2),0);kth(c(1,2),1)

## -----------------------------------------------------------------------------
Sum<-function(a,n){
  d<-length(a)
  b<-kth(a,0)
  if(n==0){
    return(b)
  }
  if(n>=1){
    for(i in 1:n){
    b<-b+kth(a,i)
    }
    return(b)
  }
}
Sum(c(1,2),10);Sum(c(1,2),100)

## -----------------------------------------------------------------------------
N<-10000 # the total number of trials
a<-c(1,2)
value<-numeric(N)
value[1]<-Sum(a,0)
value[2]<-Sum(a,1)
for(k in 1:N){
    value[k+1]<-Sum(a,k)
    if(abs(value[k+1]-value[k])<=0.000001){
      print(value[k+1])
      break
    }
}

## -----------------------------------------------------------------------------
ck<-function(a,k){
  return(sqrt(a^2*k/(k+1-a^2)))
}
solve<-function(a){
  nu<-1/2*log(k)+2*lgamma(k/2)+log(integrate(function(u){(1+u^2/(k-1))^(-k/2)},0,ck(a,k-1))$value)
  de<-1/2*log(k-1)+lgamma((k-1)/2)+lgamma((k+1)/2)+
    log(integrate(function(u){(1+u^2/k)^(-(k+1)/2)},0,ck(a,k))$value)
  return(nu-de)
}

## -----------------------------------------------------------------------------
m<-c(4:25,100,500,1000)
solution<-numeric(length(m))
for(i in 1:length(m)){
  k<-m[i]
  if(k<=25){
    a<-seq(0.01,sqrt(k)-0.01,0.01)
  }
  if(k>25){
    a<-seq(0.01,4,0.01)
  }
  y<-numeric(length(a))
  for(j in 1:length(a)){
    y[j]<-solve(a[j])
  }
  plot(a,y,type="l",main=bquote(k==.(m[i])))
  abline(h=0,col="red")
}

## -----------------------------------------------------------------------------
for(i in 1:length(m)){
  k<-m[i]
  solution[i]<-uniroot(function(a){solve(a)},lower=1,upper=2)$root
}
round(solution,4)

## -----------------------------------------------------------------------------
solve2<-function(k){
  value<-uniroot(function(a){pt(ck(a,k),df=k)-pt(ck(a,k-1),df=k-1)},lower=1,upper=2)
  value$root
}
for(i in 1:length(m)){
  k<-m[i]
  solution[i]<-solve2(k)
}
round(solution,4)

## -----------------------------------------------------------------------------
Y<-c(0.54, 0.48, 0.33, 0.43, 1.00, 1.00, 0.91, 1.00, 0.21, 0.85)
lambda<-0.375# initial
N<-10000
L<-numeric(N)
L[1]<-lambda
for(i in 2:N){
  L[i]<-(3*L[i-1]+sum(Y))/10 # for the value of 3+Y_1+\dots+Y_4+Y_7+Y_9+Y_{10} equals to sum(Y)
  if(abs(L[i]-L[i-1])<=0.0000001){
    print(L[i])
    break
  }
}

## -----------------------------------------------------------------------------
set.seed(0)
trims<-c(0,0.1,0.2,0.5)
x<-rcauchy(100)

lapply(trims, function(trim) mean(x,trim=trim))
lapply(trims, mean, x=x)

## -----------------------------------------------------------------------------
attach(mtcars)
rsq<-function(mod) summary(mod)$r.squared
formulas<-list(
  mpg ~ disp,
  mpg ~ I(1/disp),
  mpg ~ disp + wt,
  mpg ~ I(1/disp) + wt
)
la<-lapply(formulas, function(x) lm(formula = x, data = mtcars))
RSQ1<-lapply(la, function(x) rsq(x))
RSQ1

## -----------------------------------------------------------------------------
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
la2<-lapply(bootstraps, function(x) lm(mpg ~ disp,data = x))
RSQ2<-lapply(la2,function(x) rsq(x))
RSQ2

## -----------------------------------------------------------------------------
set.seed(1)
n<-5
data1<-data.frame(alpha=rnorm(n),beta=rexp(n,1))
vapply(data1,sd,FUN.VALUE =c(sd=0))
sd(data1$alpha);sd(data1$beta)

## -----------------------------------------------------------------------------
set.seed(1)
n<-3
data2<-data.frame(alpha=rnorm(n,1,1),beta = rbeta(n,1,1), gamma=c(1,"girl","boy"))
data2
# find the numeric column and pick them up to caculate the standard deviation
vapply(data2[,vapply(data2,is.numeric,logical(1))], sd, FUN.VALUE = c(sd=0))
sd(data2$alpha);sd(data2$beta)

## -----------------------------------------------------------------------------
library(parallel)
sapply

## -----------------------------------------------------------------------------
mcsapply<-function(X, FUN, ..., mc.preschedule = TRUE, mc.set.seed = TRUE, 
    mc.silent = FALSE, mc.cores = getOption("mc.cores", 2L), 
    mc.cleanup = TRUE, mc.allow.recursive = TRUE, affinity.list = NULL, 
    simplify = TRUE, USE.NAMES=TRUE) {
   FUN <- match.fun(FUN)
    answer <- mclapply(X, FUN, ..., mc.preschedule = TRUE, mc.set.seed = TRUE, 
    mc.silent = FALSE, mc.cores = getOption("mc.cores", 2L), 
    mc.cleanup = TRUE, mc.allow.recursive = TRUE, affinity.list = NULL)
    if (USE.NAMES && is.character(X) && is.null(names(answer))) 
        names(answer) <- X
    if (!isFALSE(simplify)) 
        simplify2array(answer, higher = (simplify == "array"))
    else answer
}
boot_df <- function(x) x[sample(nrow(x), rep = T), ] 
rsquared <- function(mod) summary(mod)$r.square 
boot_lm <- function(i) {
  dat <- boot_df(mtcars)
  rsquared(lm(mpg ~ wt + disp, data = dat))
}
n <- 1e4
system.time(sapply(1:n, boot_lm))
system.time(mcsapply(1:n,boot_lm, mc.cores = 4))

## -----------------------------------------------------------------------------
vapply

## -----------------------------------------------------------------------------
library(Rcpp)
library(StatComp21038)

## ----eval=FALSE---------------------------------------------------------------
#  #include <Rcpp.h>
#  using namespace Rcpp;
#  // [[Rcpp::export]]
#  NumericMatrix gibbsC(int N, int thin) {
#      NumericMatrix mat(N, 2);
#      double x = 1, y = 0.1;
#      int a=1, b=5, n=10;
#      for(int i = 0; i < N; i++) {
#          for(int j = 0; j < thin; j++) {
#              x = rbinom(1, n, y)[0];
#              y = rbeta(1, x+a, n-x+b)[0];
#          }
#          mat(i, 0) = x;
#          mat(i, 1) = y;
#      }
#      return(mat);
#  }

## -----------------------------------------------------------------------------
gibbsR <- function(N,thin) {
  mat <- matrix(nrow = N, ncol = 2)
  x <- 1;y <- 0.1
  a<-1;b<-5;n<-10
  for (i in 1:N) {
    for (j in 1:thin){
      x <- rbinom(1, n, y)
      y <- rbeta(1, x+a, n-x+b)
    }
    mat[i, ] <- c(x, y)
  }
  mat
}

## -----------------------------------------------------------------------------
set.seed(1)
N<-1e3
burn<-1e2
gibbc<-gibbsC(N,10)
gibbr<-gibbsR(N,10)
a<-ppoints(100)
Q11<-quantile(gibbc[(burn+1):N,1],a)
Q21<-quantile(gibbr[(burn+1):N,1],a)
qqplot(Q11,Q21,main="X",
       xlab = "Rcpp",ylab="R")
abline(0,1,col="red")
Q12<-quantile(gibbc[(burn+1):N,2],a)
Q22<-quantile(gibbr[(burn+1):N,2],a)
qqplot(Q12,Q22,main="Y",
       xlab = "Rcpp",ylab="R")
abline(0,1,col="red")

## -----------------------------------------------------------------------------
library(microbenchmark)
ts <- microbenchmark(gibbR=gibbsR(N,10), 
                     gibbC=gibbsC(N,10))
  summary(ts)[,c(1,2,3,4,5,6,7)]
USTCLifengLiu/StatComp21038 documentation built on Dec. 23, 2021, 10:18 p.m.