Example 1
This is a text example.
Example 2
There is a table.
|Age|Born|Died|Name|Occupation|
|---|---|---|---|---|
|37|1920-07-25|1958-04-16|Rosalind Franklin|Chemist|
|61|1876-06-13|1937-10-16|William Gosset|Statistician|
Example 3
There is a figure.
```r
y <- runif(1000)
x <- -log(1 - y)      # inverse-transform sample from Exponential(1)
hist(x, prob = TRUE)
z <- seq(0, 8, .01)
lines(z, exp(-z))     # overlay the Exponential(1) density
```
We can calculate the CDF of a Rayleigh(σ) distribution.
$$F(x) = \int_{-\infty}^x f(t)\,dt = \begin{cases} 0, & x<0,\\ \int_0^x \frac{t}{\sigma^2}e^{-t^2/(2\sigma^2)}\,dt = 1-e^{-x^2/(2\sigma^2)}, & x\ge0. \end{cases}$$
Then, inverting $F$, we get $$F^{-1}(u)=\sqrt{-2\sigma^2\ln(1-u)}, \quad 0<u<1.$$
So, in order to generate random samples from a Rayleigh($\sigma$) distribution, we can generate $U\sim U(0,1)$ and return $x=\sqrt{-2\sigma^2\ln(u)}$ (using $u$ instead of $1-u$ is valid, since $1-U$ and $U$ have the same distribution).
Here are histograms for several values of $\sigma$, in which we can compare the samples produced by the algorithm above with the theoretical density.
```r
sigma <- c(0.1, 1, 10)
m <- 1000
for (i in 1:length(sigma)) {
  u <- runif(m)
  x <- sqrt(-2 * sigma[i]^2 * log(u))
  hist(x, freq = FALSE, main = bquote(sigma == .(sigma[i])), col = rainbow(5))
  y <- seq(0, 100, 0.1)
  lines(y, y / sigma[i]^2 * exp(-y^2 / (2 * sigma[i]^2)))  # theoretical density
}
```
The algorithm performs very well.
```r
m <- 1000
r1 <- rnorm(m)
r2 <- rnorm(m, mean = 3)
p1 <- 0.75
p0 <- sample(c(0, 1), m, replace = TRUE, prob = c(1 - p1, p1))
r <- p0 * r1 + (1 - p0) * r2   # mixture: component N(0,1) with probability p1
hist(r, freq = FALSE, main = expression(p1 == 0.75), col = 2)

p <- c(0.1, 0.5, 0.9)
for (i in 1:length(p)) {
  r01 <- rnorm(m)
  r02 <- rnorm(m, 3)
  p00 <- sample(c(0, 1), m, replace = TRUE, prob = c(1 - p[i], p[i]))
  r0 <- p00 * r01 + (1 - p00) * r02
  hist(r0, freq = FALSE, main = bquote(p1 == .(p[i])), col = i)
}
```
$p_1$ determines the weights of the two components in the mixture. When $p_1$ is small, the mixture behaves like the second component $N(3,1)$; when $p_1$ is large, it behaves like the first component $N(0,1)$; when $p_1$ is close to 0.5, the density is bimodal.
```r
lambda <- c(1, 2, 3, 4, 5)
t0 <- 10
r <- c(2, 9, 6, 4, 7)
beta <- c(0.5, 3, 2, 0.8, 7)
for (i in 1:length(r)) {
  X <- numeric(10000)
  for (j in 1:10000) {
    N <- rpois(1, lambda[i] * t0)                 # number of jumps up to time t0
    X[j] <- if (N > 0) sum(rgamma(N, r[i], beta[i])) else 0
  }
  thrye <- lambda[i] * t0 * (r[i] / beta[i])      # theoretical mean of X(t)
  thryv <- lambda[i] * t0 * ((r[i] / beta[i])^2 + r[i] / beta[i]^2)  # theoretical variance of X(t)
  if (i == 1) {
    print('theoretical mean, sample mean, theoretical variance, sample variance')
  }
  print(c(thrye, mean(X), thryv, var(X)))         # in that order, left to right
}
```
The sample means and variances are close to the theoretical values.
The problem is to estimate $\theta(x) = \int_0^x t^2(1-t)^2\,dt$. Making the substitution $u = \frac{t}{x}$ (so that $dt = x\,du$), we get $$\theta(x) = \int_0^1 x(ux)^2(1-ux)^2\,du = \int_0^1 u^2x^3(1-ux)^2\,du.$$ Thus $\theta(x) = E_U[U^2x^3(1-Ux)^2]$, where the random variable $U$ has the Uniform(0,1) distribution.
So we can compute a Monte Carlo estimate of the Beta(3,3) cdf as follows, dividing $\theta(x)$ by the normalizing constant $B(3,3)$.
```r
betacdf <- function(x, y, z) {
  if (length(x) == 1) {
    if (0 < x & x < 1) {
      u <- runif(10000)
      theta <- (u^2) * (x^3) * (1 - u * x)^2
      return(mean(theta) / beta(y, z))   # divide by the Beta function B(y, z)
    } else if (x <= 0) {
      return(0)
    } else {
      return(1)
    }
  } else {
    beta1 <- numeric(length(x))
    for (i in 1:length(x)) {
      beta1[i] <- betacdf(x[i], y, z)
    }
    return(beta1)
  }
}
x <- seq(0.1, 0.9, length = 9)
cdf <- betacdf(x, 3, 3)
pbeta <- pbeta(x, 3, 3)
print(round(rbind(x, cdf, pbeta), 3))
```
The algorithm performs very well.
As computed above, the CDF of a Rayleigh($\sigma$) distribution is $F(x)=1-e^{-x^2/(2\sigma^2)}$ for $x\ge0$, with inverse $F^{-1}(u)=\sqrt{-2\sigma^2\ln(1-u)}$ for $0<u<1$.
So, in order to generate antithetic samples from a Rayleigh($\sigma$) distribution, we generate $U\sim U(0,1)$ and return both $x=\sqrt{-2\sigma^2\ln(u)}$ and $x'=\sqrt{-2\sigma^2\ln(1-u)}$.
Here are 200 samples (100 antithetic pairs), followed by the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1$, $X_2$.
```r
sigma <- 0.1
m <- 100
u <- runif(m)
x <- sqrt(-2 * sigma^2 * log(u))
xp <- sqrt(-2 * sigma^2 * log(1 - u))   # antithetic counterpart
print(round(rbind(x, xp), 3))

MCa <- function(x, R = 10000, antithetic = TRUE) {
  u <- runif(R / 2)
  if (!antithetic) {
    v <- runif(R / 2)
  } else {
    v <- 1 - u
  }
  u <- c(u, v)
  cdf <- numeric(length(x))
  for (i in 1:length(x)) {
    # MC estimate of F(x) = int_0^x t/sigma^2 * exp(-t^2/(2 sigma^2)) dt,
    # after substituting t = u*x
    g <- u * x[i]^2 * exp(-(u * x[i])^2 / (2 * sigma^2))
    cdf[i] <- mean(g) / (sigma^2)
  }
  cdf
}

MC1 <- MC2 <- numeric(m)
x <- 1.95
for (i in 1:m) {
  MC1[i] <- MCa(x, R = 1000, antithetic = FALSE)
  MC2[i] <- MCa(x, R = 1000)
}
print((var(MC1) - var(MC2)) / var(MC1))
```
Intuitively, we can choose the importance function $$f_1(x)= \frac{1}{\sqrt {2\pi}}e^{-x^2/2},\quad -\infty<x<+\infty,$$ the standard normal density (with the integrand set to zero outside $(1,\infty)$).
Making the substitution $u=\frac{x^2}{2}$, we get $\int_{1}^{\infty} \frac{x^{2}}{\sqrt{2 \pi}} e^{-x^{2}/2}\,dx=\int_{1/2}^\infty \frac{u^{1/2}}{\sqrt{\pi}}e^{-u}\,du$. So we can choose the importance function $$f_2(x)=xe^{-x^2/2},\quad 0<x<+\infty.$$
In fact, if $X$ has pdf $f_2$, then $U=X^2/2$ has the Exponential(1) distribution; equivalently, $X$ is Rayleigh(1) and can be generated as $X=\sqrt{2U}$ with $U\sim \mathrm{Exp}(1)$.
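As a quick check, samples with pdf $f_2$ can be generated through this transformation and compared with the density:

```r
# X = sqrt(2U) with U ~ Exp(1) has pdf f2(x) = x * exp(-x^2/2)
u <- rexp(10000)
x <- sqrt(2 * u)
hist(x, prob = TRUE, breaks = 50, main = "samples with pdf f2")
z <- seq(0, 4, .01)
lines(z, z * exp(-z^2 / 2), col = "blue")
```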
Then we plot the functions for comparison with g(x).
```r
x <- seq(1, 20, by = 0.1)
g <- x^2 / sqrt(2 * pi) * exp((-x^2) / 2)
f1 <- 1 / sqrt(2 * pi) * exp((-x^2) / 2)
f2 <- x * exp((-x^2) / 2)
plot(x, g)
lines(x, f1, col = 'red')    # f1 in red
lines(x, f2, col = 'blue')   # f2 in blue
```
From the graphic, one might guess that $f_1$ produces the smaller variance; the simulation below checks this.
```r
m <- 10000
x1 <- rnorm(m)                       # sample from f1 (standard normal)
fg1 <- x1^2 * (x1 > 1)               # g/f1, zero outside (1, Inf)
x2 <- sqrt(-2 * log(runif(m)))       # sample from f2 (Rayleigh(1)) by inversion
fg2 <- x2 / sqrt(2 * pi) * (x2 > 1)  # g/f2, zero outside (1, Inf)
theta <- c(mean(fg1), mean(fg2))
se <- c(sd(fg1), sd(fg2))
print(rbind(theta, se))
In fact the simulation shows that $f_2$ yields the smaller standard error: its ratio $g/f_2 = x/\sqrt{2\pi}$ varies much less over $(1,\infty)$ than $g/f_1 = x^2$. So $f_2$ is the better importance function.
Here are the estimate $\hat\theta$ (left) and its standard error (right), computed with the better importance function $f_2$:
```r
m <- 10000
x <- sqrt(-2 * log(runif(m)))     # sample from f2 (Rayleigh(1))
fg <- x / sqrt(2 * pi) * (x > 1)  # g/f2, zero outside (1, Inf)
print(c(theta = mean(fg), se = sd(fg)))
```
Suppose a $95 \%$ symmetric $t$-interval is applied to estimate a mean, but the sample data are non-normal. Then the probability that the confidence interval covers the mean is not necessarily equal to $0.95$. Use a Monte Carlo experiment to estimate the coverage probability of the $t$-interval for random samples of $\chi^{2}(2)$ data with sample size $n=20$. Compare your $t$-interval results with the simulation results in Example 6.4. (The $t$-interval should be more robust to departures from normality than the interval for variance.)
Solution.
The expectation of the $\chi^2(2)$ distribution is $2$, and its variance is $4$.
t-intervals
We calculate t-intervals based on 1000 replicates.
```r
n <- 20
alpha <- 0.05
set.seed(1)
# the |t| statistic sqrt(n)|xbar - mu|/s for each replicate
CL <- replicate(1000, expr = {
  x <- rchisq(n, df = 2)
  mu.hat <- mean(x)
  sd.hat <- sd(x)
  sqrt(n) * abs(mu.hat - 2) / sd.hat
})
# coverage proportion of the two-sided t-interval
mean(CL < qt(1 - alpha / 2, df = n - 1))
```
The proportion of replicates in which the true mean lies in the estimated CI is $0.926$, a little less than the nominal level ($0.95$).
interval for variance
We calculate intervals for variance based on 1000 replicates.
```r
set.seed(1)
R <- 1000
n <- 20
alpha <- 0.05
cover <- numeric(R)
for (i in 1:R) {
  x <- rchisq(n, df = 2)
  # one-sided CI for the variance, as in Example 6.4
  UpperBound <- (n - 1) * var(x) / qchisq(alpha, df = n - 1)
  cover[i] <- (UpperBound > 4)
}
(ECP <- sum(cover) / R)
```
The proportion of replicates in which the true variance lies in the estimated CI is $0.797$, far less than the nominal level ($0.95$).
As demonstrated by the above results ($0.926>0.797$), the t-interval is more robust to departures from normality than the interval for variance.
Use Monte Carlo simulation to investigate whether the empirical Type I error rate of the $t$-test is approximately equal to the nominal significance level $\alpha$, when the sampled population is non-normal. The $t$-test is robust to mild departures from normality. Discuss the simulation results for the cases where the sampled population is (i) $\chi^{2}(1)$, (ii) Uniform$(0,2)$, and (iii) Exponential(rate = 1). In each case, test $H_{0}: \mu=\mu_{0}$ vs $H_{1}: \mu \neq \mu_{0}$, where $\mu_{0}$ is the mean of $\chi^{2}(1)$, Uniform$(0,2)$, and Exponential$(1)$, respectively.
Solution.
The $t$-test for whether $\mu=\mu_0$ can be characterized as: $$H_0:\mu=\mu_0\longleftrightarrow H_1:\mu\neq\mu_0$$
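The corresponding simulation is not shown above; a minimal sketch is as follows (the replicate count `m = 10000` and the helper list `sampler` are our choices). Note that $\chi^2(1)$, Uniform$(0,2)$, and Exponential$(1)$ all have mean $\mu_0 = 1$.

```r
set.seed(1)
n <- 20
alpha <- 0.05
m <- 10000
# all three populations have true mean mu0 = 1
sampler <- list(
  chisq1 = function(n) rchisq(n, df = 1),
  unif02 = function(n) runif(n, 0, 2),
  exp1   = function(n) rexp(n, rate = 1)
)
t1e <- sapply(sampler, function(rgen) {
  p <- replicate(m, t.test(rgen(n), mu = 1)$p.value)
  mean(p < alpha)   # empirical Type I error rate
})
print(t1e)
```

Typically the empirical rate for $\chi^2(1)$ deviates most from $\alpha=0.05$ because of its strong skewness, the symmetric Uniform$(0,2)$ stays closest to $\alpha$, and Exponential$(1)$ lies in between, consistent with the $t$-test being robust only to mild departures from normality.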
For the multivariate skewness test, take 3-dimensional random vectors as an example.
```r
library(MASS)
sigma <- matrix(c(1,0,0, 0,1,0, 0,0,1), 3, 3)   # covariance matrix
n <- c(10, 20, 30, 50, 100, 200, 500)           # sample sizes
d <- 3                                          # dimension
cv <- qchisq(.95, d*(d+1)*(d+2)/6)              # critical value

# multivariate sample skewness statistic b_{1,d}
b1d <- function(x) {
  k <- nrow(x)
  x.lc <- matrix(rep(0, d*k), nrow = k, ncol = d)
  for (t in 1:d) {
    x.lc[, t] <- x[, t] - mean(x[, t])          # center each coordinate
  }
  sigma.hat <- t(x.lc) %*% x.lc / k             # MLE of the covariance matrix
  b1dm <- (x.lc %*% solve(sigma.hat) %*% t(x.lc))^3
  return(sum(b1dm) / k^2)
}

p.reject <- numeric(length(n))                  # store the results
m <- 1000                                       # replicates for each sample size
for (i in 1:length(n)) {
  mst <- numeric(m)                             # test decision of each replicate
  for (j in 1:m) {
    x <- mvrnorm(n[i], rep(0, 3), sigma)
    mst[j] <- as.integer(n[i] * b1d(x) / 6 >= cv)  # 1 = reject H0 (upper-tail rejection region)
  }
  p.reject[i] <- mean(mst)                      # rejection frequency
}
print(p.reject)
```
The results show that, as the sample size increases, the empirical Type I error rate approaches the significance level 0.05.
Again, take 3-dimensional vectors as an example.
```r
# d and b1d() as defined above
n <- 30                                  # sample size
cv <- qchisq(.9, d*(d+1)*(d+2)/6)        # critical value (alpha = 0.1)
p <- seq(0, 1, 0.02)
pow <- numeric(length(p))                # store the results
m <- 100                                 # replicates for each p
for (i in 1:length(p)) {
  mst <- numeric(m)                      # test decision of each replicate
  for (j in 1:m) {
    x <- matrix(rep(0, 3*n), n, 3)
    for (t in 1:n) {
      # contaminated normal: variance 1 with prob p[i], variance 10 with prob 1 - p[i]
      sigma1 <- sample(c(1, 10), size = 1, replace = TRUE, prob = c(p[i], 1 - p[i]))
      sigma2 <- diag(sigma1, 3, 3)
      x[t, ] <- mvrnorm(1, rep(0, 3), sigma2)
    }
    mst[j] <- as.integer(n * b1d(x) / 6 >= cv)  # 1 = reject H0
  }
  pow[i] <- mean(mst)                    # rejection frequency = empirical power
}
plot(p, pow, type = "b", xlab = bquote(epsilon), ylim = c(0, 1))
abline(h = .1, lty = 3)
se <- sqrt(pow * (1 - pow) / m)          # add +/- 1 s.e. curves
lines(p, pow + se, lty = 3)
lines(p, pow - se, lty = 3)
```
Clearly, when $p$ (that is, $\epsilon$) equals 0 or 1 the distribution is multivariate normal, while for $\epsilon$ strictly between 0 and 1 it is not, so the shape of the power curve matches expectations.
```r
library(bootstrap)   # for the scor data
set.seed(12121)
n <- nrow(scor)
m <- 1000
lambda.hat <- eigen(cov(scor))$values
theta.hat <- lambda.hat[1] / sum(lambda.hat)   # proportion of variance of the first PC
theta.star <- numeric(m)
for (i in 1:m) {
  index <- sample(1:n, n, replace = TRUE)      # bootstrap resample
  lambda.star <- eigen(cov(scor[index, ]))$values
  theta.star[i] <- lambda.star[1] / sum(lambda.star)
}
c(theta.hat = theta.hat,
  boot.bias_theta = mean(theta.star) - theta.hat,
  boot.se_theta = sd(theta.star))
```
```r
# jackknife estimates of bias and standard error
theta.j <- numeric(n)
for (j in 1:n) {
  lambda <- eigen(cov(scor[-j, ]))$values
  theta.j[j] <- lambda[1] / sum(lambda)
}
c(jack.bias_theta = (n - 1) * (mean(theta.j) - theta.hat),
  jack.se_theta = sqrt((n - 1) * mean((mean(theta.j) - theta.j)^2)))
```
```r
library(boot)
set.seed(12121)
theta.boot <- function(data, i) {
  lambda <- eigen(cov(data[i, ]))$values
  lambda[1] / sum(lambda)
}
boot.obj <- boot(scor, statistic = theta.boot, R = 2000)
print(boot.ci(boot.obj, type = c("perc", "bca")))
```
```r
set.seed(12345)
nor <- 999                        # number of permutation replicates
x <- rchisq(10, 10)
y <- rexp(10, 0.1)
z <- c(x, y)                      # pooled sample
N <- 1:20
cor0 <- cor(x, y, method = "spearman")
p0 <- cor.test(x, y, method = "spearman")$p.value
cor.perm <- numeric(nor)
for (i in 1:nor) {
  n <- sample(N, size = 10, replace = FALSE)
  xs <- z[n]
  ys <- z[-n]                     # complement of xs
  cor.perm[i] <- cor(xs, ys, method = "spearman")
}
p <- mean(abs(c(cor0, cor.perm)) >= abs(cor0))   # permutation ASL
c("cor.test" = p0, "permutation" = p)
```
We randomly generated two samples and computed the p-value (i.e., the achieved significance level) of the two-sided correlation test with both cor.test and the permutation method; the two results are very close.
```r
library(boot)
library(RANN)      # nn2
library(energy)    # eqdist.etest
library(Ball)      # bd.test

# nearest-neighbor statistic for the permutation test
NNF <- function(z, ix, sizes, k) {
  n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
  if (is.vector(z)) z <- data.frame(z)
  z <- z[ix, ]
  NN <- nn2(data = z, k = k + 1)
  block1 <- NN$nn.idx[1:n1, -1]
  block2 <- NN$nn.idx[(n1+1):n, -1]
  i1 <- sum(block1 <= n1); i2 <- sum(block2 > n1)
  (i1 + i2) / (k * n)
}

# p-values of the NN, energy, and ball tests for samples x and y
p <- function(x, y) {
  z <- rbind(x, y)
  N <- c(nrow(x), nrow(y))
  energyp.value <- eqdist.etest(z, sizes = N, R = 999)$p.value
  ballp.value <- bd.test(x = x, y = y, num.permutations = 999)$p.value
  boot.obj <- boot(z, statistic = NNF, R = 999, sim = "permutation",
                   sizes = N, k = 3)
  ann <- c(boot.obj$t0, boot.obj$t)
  NNp.value <- mean(ann >= ann[1])
  c(NNp.value, energyp.value, ballp.value)
}
```
Unequal variances and equal expectations
```r
library(MASS)
set.seed(12345)
e1 <- c(0, 0)
cov1 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
e2 <- c(0, 0)
cov2 <- matrix(c(3, 0, 0, 3), nrow = 2, ncol = 2)
p.values <- matrix(NA, 100, 3)
for (i in 1:100) {
  x <- mvrnorm(20, e1, cov1)
  y <- mvrnorm(20, e2, cov2)
  p.values[i, ] <- p(x, y)
}
pwr1 <- colMeans(p.values < 0.05)   # alpha = 0.05
c("NN.power" = pwr1[1], "energy.power" = pwr1[2], "ball.power" = pwr1[3])
```
So the ball method performs best here, while NN and energy are comparable.
Unequal variances and unequal expectations
```r
e1 <- c(0, 0)
cov1 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
e2 <- c(1, 1)
cov2 <- matrix(c(2, 0, 0, 2), nrow = 2, ncol = 2)
p.values <- matrix(NA, 100, 3)
for (i in 1:100) {
  x <- mvrnorm(20, e1, cov1)
  y <- mvrnorm(20, e2, cov2)
  p.values[i, ] <- p(x, y)
}
pwr1 <- colMeans(p.values < 0.05)   # alpha = 0.05
c("NN.power" = pwr1[1], "energy.power" = pwr1[2], "ball.power" = pwr1[3])
```
The ball and energy methods perform comparably, and both outperform NN.
Non-normal distributions: t distribution with 1 df (heavy-tailed distribution)
```r
p.values <- matrix(NA, 100, 3)
for (i in 1:100) {
  x <- as.matrix(rt(20, df = 1, ncp = 2), ncol = 1)
  y <- as.matrix(rt(20, df = 2, ncp = 5), ncol = 1)
  p.values[i, ] <- p(x, y)
}
pwr1 <- colMeans(p.values < 0.05)   # alpha = 0.05
c("NN.power" = pwr1[1], "energy.power" = pwr1[2], "ball.power" = pwr1[3])
```
Performance: ball > NN > energy.
Bimodal distribution (mixture of two normal distributions)
```r
# sample from a 50/50 mixture of N(e1, sd1^2) and N(e2, sd2^2)
rbimodel <- function(n, e1, e2, sd1, sd2) {
  index <- sample(1:2, n, replace = TRUE)
  x <- numeric(n)
  index1 <- which(index == 1)
  x[index1] <- rnorm(length(index1), e1, sd1)
  index2 <- which(index == 2)
  x[index2] <- rnorm(length(index2), e2, sd2)
  return(x)
}
p.values <- matrix(NA, 100, 3)
for (i in 1:100) {
  x <- as.matrix(rbimodel(20, 0, 0, 1, 2), ncol = 1)
  y <- as.matrix(rbimodel(20, 1, 1, 1, 2), ncol = 1)
  p.values[i, ] <- p(x, y)
}
pwr1 <- colMeans(p.values < 0.05)   # alpha = 0.05
c("NN.power" = pwr1[1], "energy.power" = pwr1[2], "ball.power" = pwr1[3])
```
Performance: energy > ball > NN.
Unbalanced samples (say, 1 case versus 10 controls)
```r
e1 <- c(0, 0)
cov1 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
e2 <- c(1, 1)
cov2 <- matrix(c(2, 0, 0, 2), nrow = 2, ncol = 2)
p.values <- matrix(NA, 100, 3)
for (i in 1:100) {
  x <- mvrnorm(10, e1, cov1)    # 10 cases
  y <- mvrnorm(100, e2, cov2)   # 100 controls
  p.values[i, ] <- p(x, y)
}
pwr1 <- colMeans(p.values < 0.05)   # alpha = 0.05
c("NN.power" = pwr1[1], "energy.power" = pwr1[2], "ball.power" = pwr1[3])
```
From the results, the ball and energy methods perform well, while the NN method is slightly worse.
Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard
Cauchy distribution (see qcauchy or qt with df=1). Recall that a $Cauchy(\theta,\eta)$ distribution has density function
$$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\quad -\infty<x<\infty,\ \theta>0.$$ The standard Cauchy has the Cauchy($\theta=1,\eta=0$) density.
Solution:
1. Since the standard Cauchy distribution is symmetric, we take the proposal distribution $g(\cdot|X)$ to be the (symmetric) normal density $N(0, X^2)$. Observe that the standard Cauchy is just the t distribution with 1 degree of freedom.
```r
N <- 10000
X <- numeric(N)
b <- 1001                     # discard the burn-in sample
X[1] <- rnorm(1, 0, 1)
for (i in 2:N) {
  Xt <- X[i-1]
  Y <- rnorm(1, 0, abs(Xt))   # proposal N(0, Xt^2)
  r <- dt(Y, 1) * dnorm(Xt, 0, abs(Y)) / (dt(Xt, 1) * dnorm(Y, 0, abs(Xt)))
  U <- runif(1)
  if (r > 1) r <- 1
  if (U <= r) X[i] <- Y else X[i] <- Xt
}
Y <- X[b:N]
deciles_sample <- quantile(Y, c(1:9)/10)    # sample deciles
deciles_true <- qcauchy(c(1:9)/10, 0, 1)    # deciles of the target distribution
deciles <- rbind(deciles_sample, deciles_true)
rownames(deciles) <- c("deciles_sample", "deciles_true")
colnames(deciles) <- c(c(1:9)/10)
library(knitr)
kable(deciles)
```
The sample deciles are nearly identical to those of the true distribution. Let us also inspect the QQ plot.
```r
a <- ppoints(100)
QR <- qcauchy(a, 0, 1)   # quantiles of the standard Cauchy
Q <- quantile(Y, a)      # sample quantiles
qqplot(QR, Q, main = "", xlab = "Standard Cauchy Quantiles",
       ylab = "Sample Quantiles")
abline(0, 1, col = "red")
```
The QQ plot shows that, apart from some discrepancy at the extreme quantiles, the sample quantiles agree with the theoretical ones overall, which confirms the validity of the samples produced by the Metropolis-Hastings method.
This example appears in [40]. Consider the bivariate density $$f(x, y) \propto \binom{n}{x} y^{x+a-1}(1-y)^{n-x+b-1}, \quad x=0,1, \ldots, n,\ 0 \leq y \leq 1.$$ It can be shown (see e.g. [23]) that for fixed $a$, $b$, $n$, the conditional distributions are Binomial($n$, $y$) and Beta($x + a$, $n - x + b$). Use the Gibbs sampler to generate a chain with target joint density $f(x, y)$.
Solution:
From the reference, $(X\mid Y=y)\sim \mathrm{Binomial}(n,y)$ and $(Y\mid X=x)\sim \mathrm{Beta}(x+a,\,n-x+b)$; here we take $n=25$.
```r
a <- 1
b <- 1
N <- 10000                  # chain length
X <- matrix(0, N, 2)        # the chain
X[1, ] <- c(0, 0.5)         # initial value
for (i in 2:N) {
  X2 <- X[i-1, 2]
  X[i, 1] <- rbinom(1, 25, X2)              # X | Y = y ~ Binomial(25, y)
  X1 <- X[i, 1]
  X[i, 2] <- rbeta(1, X1 + a, 25 - X1 + b)  # Y | X = x ~ Beta(x+a, 25-x+b)
}
plot(X[, 1], X[, 2], xlab = "x", ylab = "y")
```
```r
# repeat with a = 1, b = 10
a <- 1
b <- 10
X <- matrix(0, N, 2)        # the chain
X[1, ] <- c(0, 0.5)
for (i in 2:N) {
  X2 <- X[i-1, 2]
  X[i, 1] <- rbinom(1, 25, X2)
  X1 <- X[i, 1]
  X[i, 2] <- rbeta(1, X1 + a, 25 - X1 + b)
}
plot(X[, 1], X[, 2], xlab = "x", ylab = "y")
```
For each of the above exercises, use the Gelman-Rubin method to monitor convergence of the chain, and run the chain until it converges approximately to the target distribution according to $\hat R<1.2$.
Solution:
```r
# function to compute the Gelman-Rubin statistic
Gelman.Rubin <- function(psi) {
  # psi[i, j] is the statistic psi(X[i, 1:j])
  # for the chain in the i-th row of X
  psi <- as.matrix(psi)
  n <- ncol(psi)
  k <- nrow(psi)
  psi.means <- rowMeans(psi)          # row means
  B <- n * var(psi.means)             # between variance est.
  psi.w <- apply(psi, 1, "var")       # within variances
  W <- mean(psi.w)                    # within est.
  v.hat <- W * (n - 1) / n + (B / n)  # upper variance est.
  r.hat <- v.hat / W                  # G-R statistic
  return(r.hat)
}
```
Following Exercise 9.3, we first write a function that constructs the Metropolis chain for the standard Cauchy distribution.
```r
# generate a Metropolis chain for the standard Cauchy distribution,
# with the symmetric normal proposal N(0, X[t]^2) used in 9.3
# X1 is the initial value
Standard_Cauchy_Chain <- function(N, X1) {
  X <- numeric(N)
  X[1] <- X1                  # initial value
  for (i in 2:N) {
    Xt <- X[i-1]
    Y <- rnorm(1, 0, abs(Xt))
    r <- dt(Y, 1) * dnorm(Xt, 0, abs(Y)) / (dt(Xt, 1) * dnorm(Y, 0, abs(Xt)))
    U <- runif(1)
    if (r > 1) r <- 1
    if (U <= r) X[i] <- Y else X[i] <- Xt
  }
  return(X)
}
```
Next, we generate 4 such Metropolis chains, each of length N = 8000.
```r
k <- 4
N <- 8000
b <- 1000                       # burn-in length
X1 <- c(0.1, 0.2, 0.1, 0.2)     # initial values
# generate the 4 chains
set.seed(12345)
X <- matrix(0, nrow = k, ncol = N)
for (i in 1:k) {
  X[i, ] <- Standard_Cauchy_Chain(N, X1[i])
}
# compute diagnostic statistics
psi <- t(apply(X, 1, cumsum))
for (i in 1:nrow(psi))
  psi[i, ] <- psi[i, ] / (1:ncol(psi))
print(Gelman.Rubin(psi))
```
```r
# plot psi for the four chains
for (i in 1:k)
  if (i == 1) {
    plot((b+1):N, psi[i, (b+1):N], ylim = c(-1, 1), type = "l",
         xlab = 'Index', ylab = bquote(psi))
  } else {
    lines((b+1):N, psi[i, (b+1):N], col = i)
  }
par(mfrow = c(1, 1))
```
In fact, the $\psi$ plots of the four chains do not appear to converge to a common value. This may be because the Cauchy distribution has neither a finite mean nor a finite variance, which makes these ergodic averages unstable. Below we also plot the $\hat R$ statistic against the chain length N.
```r
par(mfrow = c(1, 1))
# plot the sequence of R-hat statistics
rhat <- rep(0, N)
for (j in (b+1):N)
  rhat[j] <- Gelman.Rubin(psi[, 1:j])
plot(rhat[(b+1):N], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
```
$\hat R$ falls below 1.2 at a chain length of roughly 1000.
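Since the Cauchy distribution has no finite mean, the running means used for $\psi$ are themselves unstable; one possible alternative (not required by the exercise) is to apply the same diagnostic to running medians:

```r
# Gelman-Rubin applied to running medians, on a thinned index grid
# to keep the repeated median computations cheap
idx <- seq(b + 1, N, by = 10)
psi.med <- t(sapply(1:k, function(i) sapply(idx, function(j) median(X[i, 1:j]))))
print(Gelman.Rubin(psi.med))
```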
Following Exercise 9.8, we write the Gibbs sampler for the bivariate distribution, taking a = b = 1.
```r
# Gibbs sampler for the bivariate distribution
# X1 is the initial value
Bivariate.Gibbs <- function(N, X1) {
  a <- b <- 1
  X <- matrix(0, N, 2)
  X[1, ] <- X1                # initial value
  for (i in 2:N) {
    X2 <- X[i-1, 2]
    X[i, 1] <- rbinom(1, 25, X2)
    X1 <- X[i, 1]
    X[i, 2] <- rbeta(1, X1 + a, 25 - X1 + b)
  }
  return(X)
}
```
Again we generate 4 chains, each of length N = 8000.
```r
k <- 4
N <- 8000
b <- 1000                                 # burn-in length
X1 <- cbind(c(2, 7, 10, 15), runif(4))    # initial values
# generate 4 chains; store the first coordinate in X and the second in Y
set.seed(12345)
X <- matrix(0, nrow = k, ncol = N)
Y <- matrix(0, nrow = k, ncol = N)
for (i in 1:k) {
  BG <- Bivariate.Gibbs(N, X1[i, ])
  X[i, ] <- BG[, 1]
  Y[i, ] <- BG[, 2]
}
```
We now apply the Gelman-Rubin method in each coordinate separately to monitor convergence.
```r
# first coordinate X
# compute diagnostic statistics
psi <- t(apply(X, 1, cumsum))
for (i in 1:nrow(psi))
  psi[i, ] <- psi[i, ] / (1:ncol(psi))
# plot the sequence of R-hat statistics
rhat <- rep(0, N)
for (j in (b+1):N)
  rhat[j] <- Gelman.Rubin(psi[, 1:j])
plot(rhat[(b+1):N], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
```
```r
# second coordinate Y
# compute diagnostic statistics
psi <- t(apply(Y, 1, cumsum))
for (i in 1:nrow(psi))
  psi[i, ] <- psi[i, ] / (1:ncol(psi))
# plot the sequence of R-hat statistics
rhat <- rep(0, N)
for (j in (b+1):N)
  rhat[j] <- Gelman.Rubin(psi[, 1:j])
plot(rhat[(b+1):N], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
```
Taking the $\hat R$ statistics of both coordinates into account, the chains converge at a length of roughly 4000.
(a)
```r
# k-th term (k starting from 0) of the series; lgamma is used
# to avoid overflow in the Gamma factors
cal1 <- function(a, d, k, n) {
  asquare <- sum(a^2)
  i <- 0:n
  j <- rep(c(1, -1), length = n + 1)   # (-1)^i
  A <- ((j * asquare^(i+1)) / ((2*i+1) * (2*i+2) * factorial(i) * 2^i)) *
    exp(lgamma((d+1)/2) + lgamma(i+1.5) - lgamma(i+d/2+1))
  return(A[k+1])
}
```
(b)
```r
# as in (a), but also return the sum of the first n+1 terms
cal2 <- function(a, d, k, n) {
  asquare <- sum(a^2)
  i <- 0:n
  j <- rep(c(1, -1), length = n + 1)   # (-1)^i
  A <- ((j * asquare^(i+1)) / ((2*i+1) * (2*i+2) * factorial(i) * 2^i)) *
    exp(lgamma((d+1)/2) + lgamma(i+1.5) - lgamma(i+d/2+1))
  return(c(A[k+1], sum(A)))
}
```
(c)
For $a = (1,2)^T$ and $d=1$:
```r
a <- c(1, 2)
d <- 1
res1 <- cal2(a, d, 10, 10)
res1
res2 <- cal2(a, d, 100, 100)
res2
res3 <- cal2(a, d, 200, 200)
res3
```
From the output, the 201st term is already negligibly small, and the sum is 1.813545.
```r
integrand <- function(u, k0) {
  (1 + (u^2) / (k0 - 1))^(-k0 / 2)
}
g <- function(a, k) {
  up <- sqrt(a^2 * (k - 1) / (k - a^2))
  inte <- integrate(integrand, lower = 0, upper = up, k0 = k)$value
  (2 * gamma(k/2) / (sqrt(pi * (k-1)) * gamma((k-1)/2))) * inte
}
f <- function(a, k) {
  g(a, k + 1) - g(a, k)
}
f1 <- function(a, k) {
  C <- numeric(length(a))
  for (i in 1:length(a)) {
    C[i] <- f(a[i], k)
  }
  return(C)
}
k <- c(16:25, 50, 100, 300)
a0 <- seq(0, 4, by = 0.01)
plot(a0, f1(a0, k[1]), lty = 1, col = 1, type = "l", xlim = c(0, 4),
     xlab = "a", ylab = "f(a|k)", main = "f(a) with different k")
lines(a0, f1(a0, k[10]), xlim = c(0, 4), lty = 2, col = 2)   # k[10] = 25, matching the legend
legend("topright", legend = c("k=16", "k=25"), col = 1:2, lty = 1:2)
```
We can see that the root lies between 1 and 2.
```r
sol <- function(k1) {
  uniroot(f, k = k1, lower = 1, upper = 2)$root
}
B <- numeric(length(k))
for (i in 1:length(k)) {
  B[i] <- sol(k[i])
}

# 11.4
S <- function(a, k) {
  ck <- sqrt(a^2 * k / (k + 1 - a^2))
  pt(ck, df = k, lower.tail = FALSE)
}
Ak <- function(k) {   # renamed from `solve` to avoid masking base::solve
  uniroot(function(a) S(a, k) - S(a, k - 1), lower = 1, upper = 2)$root
}
root <- matrix(0, 3, length(k))
for (i in 1:length(k)) {
  root[2, i] <- round(Ak(k[i]), 4)
}
root[1, ] <- k
root[3, ] <- B
rownames(root) <- c('k', 'A(k)', 'B(k)')
root
```
For the same $k$, $A(k)$ from 11.4 and $B(k)$ from this problem essentially coincide.
MLE:
```r
# negative log-likelihood; observations equal to 1 are right-censored at 1
LL <- function(lambda, y) {
  f <- numeric(length(y))
  for (i in 1:length(y)) {
    if (y[i] == 1) {
      f[i] <- 1 - pexp(y[i], rate = lambda)   # P(Y > 1) for censored observations
    } else {
      f[i] <- dexp(y[i], rate = lambda)       # density for fully observed values
    }
  }
  return(-sum(log(f)))
}
y <- c(0.54, 0.48, 0.33, 0.43, 1.00, 1.00, 0.91, 1.00, 0.21, 0.85)
lambda <- 1 / mean(y)   # starting value
opt <- optim(lambda, LL, y = y)
MLE <- opt$par
MLE
```
EM: Let the three censored values be $y=(y_1,y_2,y_3)$.

E step:
$$l_{o}(\lambda, y)=10 \log \lambda-\lambda \sum_{i=1}^{7} t_{i}-\lambda \sum_{j=1}^{3} y_{j}$$
$$E\left[l_{o}(\lambda, y)\mid \lambda_{0}\right]=10 \log \lambda-\lambda \sum_{i=1}^{7} t_{i}-3 \lambda E\left(y \mid \lambda_{0}\right)=10 \log \lambda-\lambda \sum_{i=1}^{7} t_{i}-3\lambda\, \frac{\lambda_{0}+1}{\lambda_{0}},$$ since for an Exponential($\lambda_0$) variable censored at 1, $E(y\mid\lambda_0)=E(Y\mid Y>1)=1+\frac{1}{\lambda_0}=\frac{\lambda_{0}+1}{\lambda_{0}}$.
M step:
$$\lambda_{1}=\arg\max_{\lambda}\, E\left[l_{o}(\lambda, y)\mid\lambda_0\right]=\frac{10}{\sum_{i=1}^{7} t_{i}+3\, \frac{\lambda_{0}+1}{\lambda_{0}}}$$
Iterating until $\lambda_1=\lambda_0=\hat\lambda$ gives $$\hat{\lambda}=\frac{10}{\beta+3\, \frac{\hat\lambda+1}{\hat\lambda}},\quad \text{where } \beta=\sum_{i=1}^{7}t_i.$$
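This fixed-point equation can in fact be solved in closed form: multiplying through by the denominator gives $\beta\hat\lambda+3(\hat\lambda+1)=10$, so $$\hat\lambda=\frac{7}{\beta+3}=\frac{7}{3.75+3}\approx 1.037,$$ which coincides with the censored-data MLE $7/\left(\sum_{i=1}^{7}t_i+3\right)$, so the EM iteration and the direct MLE should agree.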
```r
N <- 10000                     # max. number of iterations
y <- c(0.54, 0.48, 0.33, 0.43, 1.00, 1.00, 0.91, 1.00, 0.21, 0.85)
L <- 1 / mean(y)               # initial est. for lambda
tol <- .Machine$double.eps^0.5
L.old <- L + 1
for (i in 1:length(y)) {
  if (y[i] == 1) {
    y[i] <- 0                  # zero out the censored values
  }
}
beta0 <- sum(y)                # sum of the 7 uncensored observations
l1 <- length(y)                # 10 observations in total
for (j in 1:N) {
  L <- l1 / (beta0 + 3 * (L + 1) / L)   # EM update
  if (sum(abs(L - L.old) / L.old) < tol) break
  L.old <- L
}
print(list(lambda = L, iter = j, tol = tol))
EM <- L
c(MLE = MLE, EM = EM)
```
The two methods give essentially the same result.
In the first statement, each element of trims is supplied as the second argument of mean(). In the second statement, matching happens by position, because the first argument of mean() (x) is supplied by name (x) through the third argument of lapply(); see the sketch below.
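For reference, the two statements from the exercise (reproduced here, since the section omits them) are presumably:

```r
trims <- c(0, 0.1, 0.2, 0.5)
x <- rcauchy(100)
# statement 1: anonymous function, trim supplied explicitly
lapply(trims, function(trim) mean(x, trim = trim))
# statement 2: x is matched to mean()'s first argument by name,
# so each element of trims is matched positionally to trim
lapply(trims, mean, x = x)
```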
For Question 3:
```r
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
# fit with lapply
ex3la <- lapply(formulas, lm, data = mtcars)
# fit with a for loop
ex3lf <- vector("list", length(formulas))
for (i in seq_along(formulas)) {
  ex3lf[[i]] <- lm(formulas[[i]], data = mtcars)
}
rsq <- function(mod) summary(mod)$r.squared
sapply(ex3la, rsq)
sapply(ex3lf, rsq)
```
Question 4:
```r
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), replace = TRUE)
  mtcars[rows, ]
})
# fit with lapply
ex4la <- lapply(bootstraps, lm, formula = mpg ~ disp)
# fit with a for loop
ex4lf <- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)) {
  ex4lf[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]])
}
rsq <- function(mod) summary(mod)$r.squared
sapply(ex4la, rsq)
sapply(ex4lf, rsq)
```
```r
library(microbenchmark)
library(Rcpp)
set.seed(12121)
cppFunction('
NumericMatrix gibbsC(int N, int a, int b, int n) {
  NumericMatrix mat(N, 2);
  double x = 0, y = 0;
  mat(0, 0) = 0;
  mat(0, 1) = 0.5;
  for (int i = 1; i < N; i++) {
    x = rbinom(1, n, mat(i-1, 1))[0];
    mat(i, 0) = x;
    y = rbeta(1, x + a, n - x + b)[0];
    mat(i, 1) = y;
  }
  return(mat);
}
')
gibbsR <- function(N, a, b, n) {
  X <- matrix(0, N, 2)
  X[1, ] <- c(0, 0.5)
  for (i in 2:N) {
    X2 <- X[i-1, 2]
    X[i, 1] <- rbinom(1, n, X2)
    X1 <- X[i, 1]
    X[i, 2] <- rbeta(1, X1 + a, n - X1 + b)
  }
  return(X)
}
N <- 5000
C <- gibbsC(N, 1, 1, 25)
R <- gibbsR(N, 1, 1, 25)
Cg <- C[1001:N, ]        # discard burn-in
Rg <- R[1001:N, ]
a <- ppoints(100)
QC1 <- quantile(Cg[, 1], a)
QR1 <- quantile(Rg[, 1], a)
QC2 <- quantile(Cg[, 2], a)
QR2 <- quantile(Rg[, 2], a)
qqplot(QC1, QR1, main = "", xlab = "C.gibbs1 Quantiles", ylab = "R.gibbs1 Quantiles")
abline(0, 1, col = "red")
qqplot(QC2, QR2, main = "", xlab = "C.gibbs2 Quantiles", ylab = "R.gibbs2 Quantiles")
abline(0, 1, col = "red")
time <- microbenchmark(gibbsC(N, 1, 1, 25), gibbsR(N, 1, 1, 25))
summary(time)[, c(1, 3, 5, 6)]
```