```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(StatComp21097)
```
I have read through the book "R for Beginners".
1. Texts

*Italic*, **Bold**, ***Bold italic***, ~~Strikethrough~~, Baidu, Question
$$x=\frac{-b\pm\sqrt{b^2-4ac}}{2a}$$
2. Figures

(1) Plot

```r
library(scatterplot3d)
set.seed(1104)
ma <- as.data.frame(matrix(runif(3 * 50), nrow = 50, ncol = 3))
colnames(ma) <- c("X1", "X2", "X3")
scatterplot3d(ma$X1, ma$X2, ma$X3, pch = 20, highlight.3d = TRUE,
              xlab = "X1", ylab = "X2", zlab = "X3", main = "3d-plot")
```
(2) Insert a picture from a file

```r
# [IU_LILAC](./IU_LILAC.JPG)
#
# \begin{figure}
#   \centering
#   \includegraphics[width=0.4\textwidth]{./IU_LILAC.JPG}
# \end{figure}
```
3. Tables

(1) Draw a table from a data frame

```r
data <- data.frame(month = c(3, 2, 4, 5, 2, 5, 8, 6, 7, 4),
                   times = c(34, 28, 35, 18, 33, 42, 56, 10, 22, 51),
                   sex = c('M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M'),
                   row.names = paste0("student", 1:10))
data
```
(2) Beautify the table with kable

```r
library(knitr)
kable(data)
```
(3) Draw the table manually

|          | month | times | sex |
|:--------:|:-----:|:-----:|:---:|
| student1 |   3   |  34   |  M  |
| student2 |   2   |  28   |  F  |
| student3 |   4   |  35   |  M  |
| student4 |   5   |  18   |  M  |
| student5 |   8   |  33   |  F  |
I use the inverse transform method to generate random samples $X = \{x_1,x_2,\dots,x_n\}$ from a Rayleigh($\sigma$) distribution.
First, find the cdf of X:
$$
\begin{aligned}
F(x) &= \int_0^x f(u)\,du = \int_0^x \frac{u}{\sigma^2}e^{-u^2/(2\sigma^2)}\,du \\
&= \int_0^x e^{-u^2/(2\sigma^2)}\,d\left(\frac{u^2}{2\sigma^2}\right) = \int_0^{x^2/(2\sigma^2)} e^{-v}\,dv \\
&= 1-e^{-x^2/(2\sigma^2)}
\end{aligned}
$$

According to the inverse transform method, let $U = F(X)\sim U(0,1)$; then $X = F^{-1}(U)= \sqrt{-2\sigma^2\ln(1-U)}\sim f(x)$.
Use code to generate:
```r
Rayleigh <- function(sigma, n = 1e5) {
  u <- runif(n)
  x <- sqrt(-2 * sigma^2 * log(1 - u))
  hist(x, prob = TRUE, main = paste0('Rayleigh sigma=', sigma))
  y <- seq(0, 6, .01)
  lines(y, y / sigma^2 * exp(-y^2 / (2 * sigma^2)))
}
Rayleigh(0.2)
Rayleigh(0.8)
Rayleigh(1.5)
```
Generate $X = \{x_1,x_2,\dots,x_n\}$ satisfying the requirements of the exercise.
```r
mixture <- function(p1, n = 1e4) {
  x1 <- rnorm(n, 0, 1)
  x2 <- rnorm(n, 3, 1)
  r <- rbinom(n, 1, p1)
  X <- r * x1 + (1 - r) * x2
  hist(X, breaks = 30)
}
mixture(0.80)
mixture(0.75)
mixture(0.66)
mixture(0.5)
```
The closer the value of p1 is to 0.5, the more obvious the bimodality.
Simulate a compound Poisson($\lambda$)–Gamma process (the summands $Y_i$ have a Gamma distribution).
let $Y_i \sim Gamma(r,\beta)$, $N(t) \sim Poisson(\lambda t)$
```r
Poissongammar_t <- function(r, beta, lambda, t, n = 1e4) {
  Nt <- rpois(n, lambda * t)
  Xt <- numeric(n)
  for (i in 1:n) {
    Xt[i] <- sum(rgamma(Nt[i], r, beta))
  }
  return(Xt)
}
X3 <- Poissongammar_t(2, 3, 4, t = 3)
hist(X3, breaks = 30)
X6 <- Poissongammar_t(4, 2, 5, t = 6)
hist(X6, breaks = 30)
X10 <- Poissongammar_t(1, 1, 2, t = 10)
hist(X10, breaks = 30)
```
Consider the mean and variance of X(10), let $r=2,\beta=4,\lambda=3$
Theoretically:
$$
E(X(t)) = E[E(X(t)\mid N(t))] = E[E(Y_i)N(t)] = E(Y_i)E(N(t)) = \lambda t\,E(Y_i) = \lambda t\,\frac{r}{\beta} = \frac{3\times10\times2}{4} = 15
$$
$$
\begin{aligned}
Var(X(t)) &= E[X^2(t)]-E^2[X(t)] = E\{E[X^2(t)\mid N(t)]\}-\lambda^2t^2E^2(Y_i)\\
&= E\left[N(t)E(Y_i^2)+(N(t)^2-N(t))E^2(Y_i)\right]-\lambda^2t^2E^2(Y_i)\\
&= \lambda t\,E(Y_i^2) + \lambda^2t^2E^2(Y_i)-\lambda^2t^2E^2(Y_i)\\
&= \lambda t\,E(Y_i^2) = \lambda t\,\frac{r(r+1)}{\beta^2} = \frac{3\times10\times2\times3}{16} = 11.25
\end{aligned}
$$
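These theoretical values can be verified with a quick numerical check of the arithmetic above:

```r
# numerical check of the theoretical values for r = 2, beta = 4, lambda = 3, t = 10
r <- 2; beta <- 4; lambda <- 3; t <- 10
lambda * t * r / beta              # E(X(10)) = 15
lambda * t * r * (r + 1) / beta^2  # Var(X(10)) = 11.25
```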
Experimentally:
```r
X10 <- Poissongammar_t(2, 4, 3, t = 10)
mean(X10)
var(X10)
```
It can be seen that the mean and variance estimates generated by the algorithm are very similar to the theoretical ones.
The density function of Beta(3,3) is $\frac{x^2(1-x)^2}{\int_0^1u^2(1-u)^2du}$
So the Beta(3,3) cdf can be written as
$$
F(x)= \int_0^x\frac{t^2(1-t)^2}{\int_0^1u^2(1-u)^2du}\,dt
=\frac{x}{\int_0^1u^2(1-u)^2du}\,E\left[X^2(1-X)^2\right],\quad X \sim U(0,x).
$$
Here $A := \int_0^1u^2(1-u)^2du = E\left[U^2(1-U)^2\right] = 1/30$ with $U\sim U(0,1)$.
```r
n <- 1e7
U <- runif(n, 0, 1)
# estimate A by Monte Carlo
A <- mean(U^2 * (1 - U)^2)
A
# exact value of A
A <- 1 / 30
A
cdf <- function(x) {
  X <- runif(n, 0, x)
  cdfx <- x / A * mean(X^2 * (1 - X)^2)
  return(cdfx)
}
x <- 0.1 * c(1:9)
x
result <- list()
for (i in x) {
  result <- append(result, cdf(i))
}
round(unlist(result), 5)
pbeta(x, 3, 3)
```
The estimate is very accurate.
I use the inverse transform method to generate random samples $X = \{x_1,x_2,\dots,x_n\}$ from a Rayleigh($\sigma$) distribution.
First, find the cdf of X:
$$
\begin{aligned}
F(x) &= \int_0^x f(u)\,du = \int_0^x \frac{u}{\sigma^2}e^{-u^2/(2\sigma^2)}\,du \\
&= \int_0^x e^{-u^2/(2\sigma^2)}\,d\left(\frac{u^2}{2\sigma^2}\right) = \int_0^{x^2/(2\sigma^2)} e^{-v}\,dv \\
&= 1-e^{-x^2/(2\sigma^2)}
\end{aligned}
$$

According to the inverse transform method, let $U = F(X)\sim U(0,1)$; then $X = F^{-1}(U)= \sqrt{-2\sigma^2\ln(1-U)}\sim f(x)$.
The function can be written as:
```r
Rayleigh <- function(sigma, n = 1e5, seed = 123) {
  set.seed(seed)
  u <- runif(n)
  x <- sqrt(-2 * sigma^2 * log(1 - u))
  # hist(x, prob = TRUE, main = paste0('Rayleigh sigma=', sigma))
  # y <- seq(0, 6, .01)
  # lines(y, y/sigma^2*exp(-y^2/(2*sigma^2)))
  return(x)
}
```
When using antithetic variables, the second sample is $X' = F^{-1}(1-U)= \sqrt{-2\sigma^2\ln U}\sim f(x)$.
```r
Rayleigh_anti <- function(sigma, n = 1e5, seed = 123) {
  set.seed(seed)
  u <- runif(n)
  x <- sqrt(-2 * sigma^2 * log(u))
  # hist(x, prob = TRUE, main = paste0('Rayleigh sigma=', sigma))
  # y <- seq(0, 6, .01)
  # lines(y, y/sigma^2*exp(-y^2/(2*sigma^2)))
  return(x)
}
```
We can generate the same values of U by the same seed.
```r
X1 <- Rayleigh(2, n = 1e4, seed = 123)
X2 <- Rayleigh(2, n = 1e4, seed = 1234)
X  <- Rayleigh(2, n = 1e4, seed = 123)
X_ <- Rayleigh_anti(2, n = 1e4, seed = 123)
p <- var(1/2 * (X + X_)) / var(1/2 * (X1 + X2))
paste0('Variance reduction is ', (1 - p))
```
Let $f_1=e^{-x+1}, x>1$,$f_2=\frac{2}{\sqrt{2\pi}}e^{-\frac{(x-1)^2}{2}},x>1$
$f_1$: $\int_1^\infty\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}\,dx= \int_1^\infty\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2+x-1}f_1(x)\,dx=E\left[\frac{X^2}{\sqrt{2\pi}}e^{-X^2/2+X-1}\right],\ X\sim f_1$

$f_2$: $\int_1^\infty\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}\,dx=\int_1^\infty\frac{x^2}{2}e^{-x+1/2}f_2(x)\,dx = E\left[\frac{X^2}{2}e^{-X+1/2}\right],\ X\sim f_2$
```r
n <- 1e5
# importance function f1
X <- rexp(n) + 1
f1 <- X^2 / sqrt(2 * pi) * exp(-X^2 / 2 + X - 1)
mean(f1)
var(f1) / n
# importance function f2
X <- abs(rnorm(n, 0, 1)) + 1
f2 <- X^2 / 2 * exp(-X + 1/2)
mean(f2)
var(f2) / n
```
The two estimates of the integral are very similar, but the variance when using f2 is much smaller than when using f1, because f2 is closer in shape to $g(x):=\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}$, so $g/f_2$ is closer to a constant.
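To see this, we can plot the ratio of $g(x)$ to each importance function; the flatter curve corresponds to the smaller variance. This is a rough illustrative sketch, not part of the original exercise code:

```r
# compare how close g/f1 and g/f2 are to a constant on (1, Inf)
g  <- function(x) x^2 / sqrt(2 * pi) * exp(-x^2 / 2)
f1 <- function(x) exp(-x + 1)                              # shifted exponential
f2 <- function(x) 2 / sqrt(2 * pi) * exp(-(x - 1)^2 / 2)   # folded normal shifted to (1, Inf)
x <- seq(1, 5, 0.01)
plot(x, g(x) / f2(x), type = "l", ylim = c(0, 0.7), ylab = "ratio")
lines(x, g(x) / f1(x), lty = 2)
legend("topright", legend = c("g/f2", "g/f1"), lty = c(1, 2))
```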
I use the importance function $f_2=\frac{2}{\sqrt{2\pi}}e^{-\frac{(x-1)^2}{2}},x>1$ proposed in 5.13 to estimate the integral in Exercise 5.14.
$\int_1^\infty\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2} dx=\int_1^\infty\frac{x^2}{2}e^{-x+1/2}f_2dx = E(\frac{x^2}{2}e^{-x+1/2}),X\sim f_2$
```r
n <- 1e5
X <- abs(rnorm(n, 0, 1)) + 1  # sample from f2
f2 <- X^2 / 2 * exp(-X + 1/2)
mean(f2)
```
For a sample of size $n = 20$,
$$
\frac{\bar{x}-\mu}{S/\sqrt{n}}\sim t(n-1)=t(19),
$$
so the t-interval can be written as $\left(\bar{x}-t_{0.975}(19)\,S/\sqrt{n},\ \bar{x}+t_{0.975}(19)\,S/\sqrt{n}\right)$. Theoretically, $\mu = E(\chi^2(2))=2$.
```r
n <- 20
alpha <- 0.05
CI <- replicate(1000, expr = {
  x <- rchisq(n, df = 2)
  c(mean(x) - sd(x) / sqrt(n) * qt(1 - alpha / 2, df = n - 1),
    mean(x) + sd(x) / sqrt(n) * qt(1 - alpha / 2, df = n - 1))
})
mean(CI[1, ] <= 2 & CI[2, ] >= 2)
```
For the estimate in Example 6.4, the interval is $\left(0,\ (n-1)S^2/\chi^2_{\alpha}(n-1)\right)$.
```r
n <- 20
alpha <- 0.05
UCL <- replicate(1000, expr = {
  x <- rchisq(n, df = 2)
  (n - 1) * var(x) / qchisq(alpha, n - 1)
})
mean(UCL > 4)
```
The empirical coverage of the t-interval is closer to 0.95 than that of the interval from Example 6.4.
$T = \frac{\bar{X}-\mu_0}{S/\sqrt{n}}\sim t(n-1),\ n = 50$
(1)$\chi^2(1)$
$\mu_0=E[\chi^2(1)]=1$
```r
n <- 50
alpha <- 0.05
mu0 <- 1
df <- 1
m <- 10000
p <- numeric(m)
for (j in 1:m) {
  x <- rchisq(n, df = df)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
p.hat
```
(2)$U(0,2)$
$\mu_0 = E[U(0,2)]=1$
```r
n <- 50
alpha <- 0.05
mu0 <- 1
m <- 10000
p <- numeric(m)
for (j in 1:m) {
  x <- runif(n, 0, 2)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
p.hat
```
(3)$Exp(1)$
$\mu_0 = E[\mathrm{Exp}(1)]=1$
```r
n <- 50
alpha <- 0.05
mu0 <- 1
rate <- 1
m <- 10000
p <- numeric(m)
for (j in 1:m) {
  x <- rexp(n, rate = rate)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
p.hat
```
All three empirical Type I error rates are very close to 0.05, so the t-test is robust to mild departures from normality.
(1)
Let the powers of one method be $P_1=(P_{11},P_{12},\dots,P_{1K})$ and the powers of the other method be $P_2=(P_{21},P_{22},\dots,P_{2K})$. The corresponding hypothesis test problem is $H_0$: the power of method 1 ($P_1$) equals the power of method 2 ($P_2$), versus $H_1$: the power of method 1 differs from the power of method 2, at the 0.05 level.
(2)
We should use a two-sample t-test.
(3)
We need to run both methods in the 10,000 experiments to obtain K power estimates for each, so each method yields a sample of K powers, $P_1=(P_{11},P_{12},\dots,P_{1K})$ and $P_2=(P_{21},P_{22},\dots,P_{2K})$. We can then test the hypothesis with these two samples, as sketched below.
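A minimal sketch of the comparison, assuming the vectors power1 and power2 (hypothetical placeholders here) hold the K power estimates obtained with each method:

```r
# hypothetical placeholder data standing in for the K simulated power estimates
K <- 100
power1 <- runif(K, 0.6, 0.7)  # powers of method 1 (placeholder values)
power2 <- runif(K, 0.6, 0.7)  # powers of method 2 (placeholder values)
# two-sample t-test of equal mean power at the 0.05 level
t.test(power1, power2, alternative = "two.sided", conf.level = 0.95)
```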
$H_0:\beta_{1,d} = 0, H_1:\beta_{1,d} \neq0$
$b_{1,d} = \frac{1}{n^2}\sum_{i,j=1}^{n}\left((X_i-\bar{X})^T\hat{\Sigma}^{-1}(X_j-\bar{X})\right)^3;\quad nb_{1,d}/6 \sim \chi^2_{d(d+1)(d+2)/6}$
Example 6.8
```r
library(MASS)
n <- c(10, 20, 30, 50, 100, 500)  # sample sizes
m <- 1000
# compute the multivariate skewness statistic b_{1,d}
b1d <- function(x, n) {
  xbar <- colMeans(x)
  Cov <- (n - 1) / n * cov(x)            # MLE of the covariance matrix
  Xbar <- t(matrix(rep(xbar, n), d, n))  # relies on the dimension d in the global environment
  b <- sum(((x - Xbar) %*% solve(Cov) %*% t(x - Xbar))^3)
  return(b / n^2)
}
```
```r
# d = 1
d <- 1
int1 <- qchisq(0.975, df = d * (d + 1) * (d + 2) / 6)
int2 <- qchisq(0.025, df = d * (d + 1) * (d + 2) / 6)
power <- numeric(length(n))
# estimation
for (i in 1:length(n)) {
  test <- numeric(m)
  for (j in 1:m) {
    x <- mvrnorm(n[i], numeric(d), diag(d))
    statis <- b1d(x, n[i]) * n[i] / 6
    test[j] <- as.integer(statis > int1 | statis < int2)
  }
  power[i] <- mean(test)
}
power <- data.frame(n = n, estimate = power)
power
```
```r
# d = 2
d <- 2
int1 <- qchisq(0.975, df = d * (d + 1) * (d + 2) / 6)
int2 <- qchisq(0.025, df = d * (d + 1) * (d + 2) / 6)
power <- numeric(length(n))
# estimation
for (i in 1:length(n)) {
  test <- numeric(m)
  for (j in 1:m) {
    x <- mvrnorm(n[i], numeric(d), diag(d))
    statis <- b1d(x, n[i]) * n[i] / 6
    test[j] <- as.integer(statis >= int1 | statis <= int2)
  }
  power[i] <- mean(test)
}
power <- data.frame(n = n, estimate = power)
power
```
The estimated powers are all close to 0.05, as expected.
Example 6.10
```r
# d = 1
d <- 1
n <- 30
m <- 1000
epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
N <- length(epsilon)
power <- numeric(N)
int1 <- qchisq(0.95, df = d * (d + 1) * (d + 2) / 6)
int2 <- qchisq(0.05, df = d * (d + 1) * (d + 2) / 6)
# estimation
for (i in 1:N) {
  epsi <- epsilon[i]
  test <- numeric(m)
  for (j in 1:m) {
    sigma <- sample(c(1, 10), replace = TRUE, size = n, prob = c(1 - epsi, epsi))
    # the number of sigma = 1 (n1) and sigma = 10 (n2)
    n1 <- sum(sigma == 1)
    n2 <- n - n1
    if (n1 != 0 & n2 != 0) {
      x1 <- mvrnorm(n1, numeric(d), diag(d) * (1^2))
      x2 <- mvrnorm(n2, numeric(d), diag(d) * (10^2))
      x <- rbind(x1, x2)
    } else if (n1 == 0) {
      x <- mvrnorm(n2, numeric(d), diag(d) * (10^2))
    } else {
      x <- mvrnorm(n1, numeric(d), diag(d) * (1^2))
    }
    statis <- b1d(x, n) * n / 6
    test[j] <- as.integer(statis >= int1 | statis <= int2)
  }
  power[i] <- mean(test)
}
# power
p <- data.frame(epsilon = epsilon, estimate = power)
p
# plot power vs epsilon
plot(epsilon, power, type = "b", xlab = bquote(epsilon), ylim = c(0, 1))
abline(h = 0.1, lty = 3)
se <- sqrt(power * (1 - power) / m)
lines(epsilon, power + se, lty = 3)
lines(epsilon, power - se, lty = 3)
```
```r
# d = 2
d <- 2
n <- 30
m <- 1000
epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
N <- length(epsilon)
power <- numeric(N)
int1 <- qchisq(0.95, df = d * (d + 1) * (d + 2) / 6)
int2 <- qchisq(0.05, df = d * (d + 1) * (d + 2) / 6)
# estimation
for (i in 1:N) {
  epsi <- epsilon[i]
  test <- numeric(m)
  for (j in 1:m) {
    sigma <- sample(c(1, 10), replace = TRUE, size = n, prob = c(1 - epsi, epsi))
    # the number of sigma = 1 (n1) and sigma = 10 (n2)
    n1 <- sum(sigma == 1)
    n2 <- n - n1
    if (n1 != 0 & n2 != 0) {
      x1 <- mvrnorm(n1, numeric(d), diag(d) * (1^2))
      x2 <- mvrnorm(n2, numeric(d), diag(d) * (10^2))
      x <- rbind(x1, x2)
    } else if (n1 == 0) {
      x <- mvrnorm(n2, numeric(d), diag(d) * (10^2))
    } else {
      x <- mvrnorm(n1, numeric(d), diag(d) * (1^2))
    }
    statis <- b1d(x, n) * n / 6
    test[j] <- as.integer(statis >= int1 | statis <= int2)
  }
  power[i] <- mean(test)
}
# power
p <- data.frame(epsilon = epsilon, estimate = power)
p
# plot power vs epsilon
plot(epsilon, power, type = "b", xlab = bquote(epsilon), ylim = c(0, 1))
abline(h = 0.1, lty = 3)
se <- sqrt(power * (1 - power) / m)
lines(epsilon, power + se, lty = 3)
lines(epsilon, power - se, lty = 3)
```
The plots show that when $\epsilon$ is close to 0 or 1 the power is close to 0.1, so at the significance level $\alpha = 0.1$ the distribution can be regarded as symmetric in those cases. The power first rises and then falls: as $\epsilon$ initially increases, the generated distribution departs further from symmetry, and as $\epsilon$ continues to increase the power drops back toward 0.1, indicating that the generated distribution becomes approximately symmetric again.
```r
library(bootstrap)
data(scor)
X <- scor
```
```r
# proportion of variance explained by the first principal component
theta <- function(X, i = 1:nrow(X)) {
  x <- X[i, ]
  n <- nrow(x)                  # sample size of the (re)sample
  Cov <- (n - 1) / n * cov(x)   # MLE of the covariance matrix
  E <- eigen(Cov)
  lam <- E$values
  theta <- max(lam) / sum(lam)
  return(theta)
}
```
```r
library(boot)
n <- nrow(X)
boots <- boot(data = X, statistic = theta, R = 2000)
round(c(original = boots$t0,
        boot.bias = mean(boots$t) - boots$t0,
        boot.se = sd(boots$t)), 3)
```
```r
n <- nrow(X)
theta.hat <- theta(X)
theta.jack <- numeric(n)
for (i in 1:n) {
  theta.jack[i] <- theta(X[-i, ])
}
jack.bias <- (n - 1) * (mean(theta.jack) - theta.hat)
jack.se <- sqrt((n - 1) * mean((theta.jack - theta.hat)^2))
round(c(original = theta.hat, jack.bias = jack.bias, jack.se = jack.se), 3)
```
```r
boots <- boot(data = X, statistic = theta, R = 2000)
ci <- boot.ci(boots, 0.95, type = c("perc", "bca"))
print("percentile.ci:")
print(ci$percent[4:5])
print("Bca.ci:")
print(ci$bca[4:5])
```
skewness: $\sqrt{\beta_1}=\frac{E[(X-\mu_X)^3]}{\sigma_X^3}$ can be estimated by $\sqrt{b_1}= \frac{\frac{1}{n}\sum_{i=1}^n(X_i-\bar{X})^3}{\left(\frac{1}{n}\sum_{i=1}^n(X_i-\bar{X})^2\right)^{3/2}}$
```r
skew <- function(x, i) {
  xbar <- mean(x[i])
  t1 <- mean((x[i] - xbar)^3)
  t2 <- mean((x[i] - xbar)^2)
  return(t1 / t2^(3 / 2))
}
```
normal:
```r
library(boot)
n <- 20
set.seed(123)
m <- 2000
ci.norm <- matrix(NA, m, 2)
ci.basic <- matrix(NA, m, 2)
ci.perc <- matrix(NA, m, 2)
for (i in 1:m) {
  X <- rnorm(n)
  theta.hat <- skew(X)
  boots <- boot(data = X, statistic = skew, R = 500)
  ci <- boot.ci(boots, 0.95, type = c("norm", "basic", "perc"))
  ci.norm[i, ] <- ci$norm[2:3]
  ci.basic[i, ] <- ci$basic[4:5]
  ci.perc[i, ] <- ci$percent[4:5]
}
cat('miss.norm:', mean(ci.norm[, 1] >= 0 | ci.norm[, 2] <= 0))
cat('miss.basic:', mean(ci.basic[, 1] >= 0 | ci.basic[, 2] <= 0))
cat('miss.percent:', mean(ci.perc[, 1] >= 0 | ci.perc[, 2] <= 0))
```
The miss rates for the normal samples are close to 0.05, which is consistent with the skewness of the normal distribution being 0.
$\chi^2(5)$
```r
library(boot)
n <- 20
set.seed(123)
m <- 2000
ci.norm <- matrix(NA, m, 2)
ci.basic <- matrix(NA, m, 2)
ci.perc <- matrix(NA, m, 2)
for (i in 1:m) {
  X <- rchisq(n, df = 5)
  theta.hat <- skew(X)
  boots <- boot(data = X, statistic = skew, R = 500)
  ci <- boot.ci(boots, 0.95, type = c("norm", "basic", "perc"))
  ci.norm[i, ] <- ci$norm[2:3]
  ci.basic[i, ] <- ci$basic[4:5]
  ci.perc[i, ] <- ci$percent[4:5]
}
cat('miss.norm:', mean(ci.norm[, 1] >= 0 | ci.norm[, 2] <= 0))
cat('miss.basic:', mean(ci.basic[, 1] >= 0 | ci.basic[, 2] <= 0))
cat('miss.percent:', mean(ci.perc[, 1] >= 0 | ci.perc[, 2] <= 0))
```
The miss rates for the $\chi^2(5)$ samples are far from 0.05, which is consistent with the $\chi^2(5)$ distribution having positive skewness.
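For reference, the theoretical skewness of $\chi^2(\nu)$ is $\sqrt{8/\nu}$, so for $\chi^2(5)$ it is clearly positive:

```r
# theoretical skewness of the chi-square(5) distribution
sqrt(8 / 5)  # about 1.26, far from 0
```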
```r
library(boot)
library(RANN)
# library(energy)  # the energy package could not be used on the server
library(Ball)
```
```r
# permutation test of the Spearman correlation
n1 <- 8
n2 <- 8
set.seed(223344)
x <- rnorm(n1, 2, 6)
y <- rnorm(n2, 3, 5)
Z <- c(x, y)
Rep <- 1002
result <- numeric(Rep)
t0 <- cor(x, y, method = "spearman")
for (i in 1:Rep) {
  index <- sample(n1 + n2, size = n1, replace = FALSE)
  xx <- Z[index]
  yy <- Z[-index]
  result[i] <- cor(xx, yy, method = "spearman")
}
p.value <- mean(abs(c(t0, result)) >= abs(t0))
cat("P.value of the permutation test is", p.value)
```
```r
# cor.test
Test <- cor.test(x, y, alternative = 'two.sided', method = "spearman")
p.value <- Test$p.value
cat("P.value of cor.test is", p.value)
```
The p-values from the two methods are very close.
power comparison
alpha = 0.1
define the nntest
```r
# define the NN test
NN <- function(z, i, sizes, K) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- sum(sizes)
  if (is.vector(z)) {
    z <- data.frame(z, 0)
  }
  NN <- nn2(data = z[i, ], k = K + 1)
  B1 <- NN$nn.idx[1:n1, -1]
  B2 <- NN$nn.idx[(n1 + 1):n, -1]
  i1 <- sum(B1 < n1 + 0.5)
  i2 <- sum(B2 > n1 + 0.5)
  return((i1 + i2) / (K * n))
}
nn.test <- function(z, sizes, k) {
  boot1 <- boot(data = z, statistic = NN, R = 800, sim = "permutation",
                sizes = sizes, K = k)
  return(mean(c(boot1$t0, boot1$t) >= boot1$t0))
}
```
1)Unequal variances and equal expectations
$X\sim N(0,2), Y\sim N(0,1), len(X) = n1 = 10, len(Y) = n2 = 15$
```r
# Unequal variances and equal expectations
# Rep <- 888
# set.seed(445566)
# n1 <- 10
# n2 <- 15
# N <- c(n1, n2)
# k <- 3
# p1 <- numeric(Rep)
# p2 <- numeric(Rep)
# p3 <- numeric(Rep)
# for (i in 1:Rep) {
#   x <- rnorm(n1, 0, 2)
#   y <- rnorm(n2, 0, 1)
#   Z <- c(x, y)
#   p1[i] <- nn.test(Z, N, k)
#   p2[i] <- eqdist.etest(Z, sizes = N, R = 800)$p.value
#   p3[i] <- bd.test(x = x, y = y, num.permutations = 800)$p.value
# }
# cat('powers of NN is', mean(p1 < alpha))
# cat('powers of Energy is', mean(p2 < alpha))
# cat('powers of Ball is', mean(p3 < alpha))
```
Output: power of NN = 0.3018018, power of Energy = 0.3209459, power of Ball = 0.3918919.
2)Unequal variances and unequal expectations
$X \sim N(0,2), Y \sim N(1,1), len(X) = n1 = 10, len(Y) = n2 = 15$
```r
# Unequal variances and unequal expectations
# Rep <- 888
# set.seed(445566)
# n1 <- 10
# n2 <- 15
# N <- c(n1, n2)
# k <- 3
# p1 <- numeric(Rep)
# p2 <- numeric(Rep)
# p3 <- numeric(Rep)
# for (i in 1:Rep) {
#   x <- rnorm(n1, 0, 2)
#   y <- rnorm(n2, 1, 1)
#   Z <- c(x, y)
#   p1[i] <- nn.test(Z, N, k)
#   p2[i] <- eqdist.etest(Z, sizes = N, R = 800)$p.value
#   p3[i] <- bd.test(x = x, y = y, num.permutations = 800)$p.value
# }
# cat('powers of NN is', mean(p1 < alpha))
# cat('powers of Energy is', mean(p2 < alpha))
# cat('powers of Ball is', mean(p3 < alpha))
```
Output: power of NN = 0.4684685, power of Energy = 0.6497748, power of Ball = 0.5945946.
3) Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodal distribution (mixture of two normal distributions)
$X \sim t(1), Y \sim 0.5N(0,1)+(1-0.5)N(0,10), len(X) = n1 = 10, len(Y) = n2 = 15$
```r
# Non-normal distributions
# Rep <- 888
# set.seed(445566)
# n1 <- 10
# n2 <- 15
# N <- c(n1, n2)
# k <- 3
# p1 <- numeric(Rep)
# p2 <- numeric(Rep)
# p3 <- numeric(Rep)
# for (i in 1:Rep) {
#   x <- rt(n1, df = 1)
#   # mixture with sigma = 1 or 10, epsilon = 0.5
#   sig110 <- c(1, 10)
#   sig <- sig110[rbinom(n2, 1, 0.5) + 1]
#   y <- rnorm(n2, 0, sig)
#   Z <- c(x, y)
#   p1[i] <- nn.test(Z, N, k)
#   p2[i] <- eqdist.etest(Z, sizes = N, R = 800)$p.value
#   p3[i] <- bd.test(x = x, y = y, num.permutations = 800)$p.value
# }
# cat('powers of NN is', mean(p1 < alpha))
# cat('powers of Energy is', mean(p2 < alpha))
# cat('powers of Ball is', mean(p3 < alpha))
```
Output: power of NN = 0.1306306, power of Energy = 0.1103604, power of Ball = 0.2150901.
4) Unbalanced samples (say, 1 case versus 10 controls), same distribution
$X \sim N(1,1), Y \sim N(1,1), len(X) = n1 = 1, len(Y) = n2 = 10$
```r
# Unbalanced samples
# Rep <- 888
# set.seed(445566)
# n1 <- 1
# n2 <- 10
# N <- c(n1, n2)
# k <- 3
# p1 <- numeric(Rep)
# p2 <- numeric(Rep)
# p3 <- numeric(Rep)
# for (i in 1:Rep) {
#   x <- rnorm(n1, 1, 1)
#   y <- rnorm(n2, 1, 1)
#   Z <- c(x, y)
#   p1[i] <- nn.test(Z, N, k)
#   p2[i] <- eqdist.etest(Z, sizes = N, R = 800)$p.value
#   p3[i] <- bd.test(x = x, y = y, num.permutations = 800)$p.value
# }
# cat('powers of NN is', mean(p1 < alpha))
# cat('powers of Energy is', mean(p2 < alpha))
# cat('powers of Ball is', mean(p3 < alpha))
```
Output: power of NN = 0.0518018, power of Energy = 0.1295045, power of Ball = 0.1013514.
In these four scenarios the energy and Ball tests have higher power than the NN test, so they perform better.
Let the proposal distribution be $g(\cdot\mid X_t) = N(X_t,4)$; generate $X_0$ from $N(0,4)$ and store it in x[1].
```r
set.seed(123)
n <- 10000
x <- numeric(n)
# generate x0 from N(0, 4)
x[1] <- rnorm(1, mean = 0, sd = 4)
# Metropolis-Hastings loop: propose, then accept or reject
for (i in 2:n) {
  Y <- rnorm(n = 1, mean = x[i - 1], sd = 4)
  # generate U randomly with runif
  U <- runif(1)
  Stats <- dcauchy(Y) * dnorm(x[i - 1], mean = Y, sd = 4) /
    (dcauchy(x[i - 1]) * dnorm(Y, mean = x[i - 1], sd = 4))
  if (Stats >= U) {
    x[i] <- Y
  } else {
    x[i] <- x[i - 1]
  }
}
```
```r
# plot the chain, discarding the first 1000 values
plot(1001:n, x[1001:n], type = "l", ylab = "x")
```

```r
# compare the theoretical Cauchy deciles with the sample deciles of the chain
qcauchy(0.1 * c(1:9))
quantile(x[1001:n], probs = 0.1 * c(1:9))
```
The sample deciles are very close to the theoretical Cauchy deciles.
Take $a=2, b=3, n=5$ as an example.
```r
m <- 5000
x <- matrix(0, m, 2)
a <- 2
b <- 3
n <- 5
x[1, ] <- c(2.5, 0.5)
# f(x|y) ~ Binomial(n, y)
# f(y|x) ~ Beta(x + a, n - x + b)
for (i in 2:m) {
  x[i, 1] <- rbinom(1, n, x[i - 1, 2])
  x[i, 2] <- rbeta(1, x[i - 1, 1] + a, n - x[i - 1, 1] + b)
}
```
```r
# plot the two chains
plot(1:(m - 1000), x[1001:m, 1], type = 'l', col = "red", ylab = "x: red, y: black")
lines(x[1001:m, 2])
# show the joint distribution
plot(x[1001:m, ], xlab = 'x', ylab = 'y')
```
For each of the above exercises, use the Gelman-Rubin method to monitor convergence of the chain, and run the chain until it converges approximately to the target distribution according to $\hat{R} < 1.2$.
9.3:
```r
# Gelman-Rubin statistic
CR <- function(PSI, n, k) {
  # n: number of columns; k: number of rows (chains)
  PSIi_bar <- rowMeans(PSI)
  PSIij_bar <- mean(PSI)
  Bn <- n / (k - 1) * sum((PSIi_bar - PSIij_bar)^2)
  Wn <- mean(apply(PSI, 1, "var"))
  Var <- (n - 1) / n * Wn + 1 / n * Bn
  R <- Var / Wn
  return(R)
}
```
```r
set.seed(123)
n <- 10000
k <- 4
X <- matrix(0, k, n)
for (kk in 1:k) {
  x <- numeric(n)
  # generate x0 from N(0, 4)
  x[1] <- rnorm(1, mean = 0, sd = 4)
  # Metropolis-Hastings loop
  for (i in 2:n) {
    Y <- rnorm(n = 1, mean = x[i - 1], sd = 4)
    U <- runif(1)
    Stats <- dcauchy(Y) * dnorm(x[i - 1], mean = Y, sd = 4) /
      (dcauchy(x[i - 1]) * dnorm(Y, mean = x[i - 1], sd = 4))
    if (Stats >= U) {
      x[i] <- Y
    } else {
      x[i] <- x[i - 1]
    }
  }
  X[kk, ] <- x
}
```
```r
to_psi <- function(x) {
  x / c(1:length(x))
}
psi <- t(apply(apply(X, 1, cumsum), 2, to_psi))
```

```r
CR(psi, n, k)
```
```r
# plot the psi
plot((1000 + 1):n, psi[1, (1000 + 1):n], ylim = c(-1.5, 2.6), type = "l",
     xlab = 'Index', ylab = bquote(phi))
for (i in 2:4) {
  lines((1000 + 1):n, psi[i, (1000 + 1):n], col = i + 1)
}
```
```r
# plot the R statistic
par(mfrow = c(1, 1))
R <- numeric(n)
for (j in (1000 + 1):n) {
  R[j] <- CR(psi[, 1:j], j, k)
}
plot(R[(1000 + 1):n], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
```
9.8
```r
# Gelman-Rubin statistic
CR <- function(PSI, n, k) {
  # n: number of columns; k: number of rows (chains)
  PSIi_bar <- rowMeans(PSI)
  PSIij_bar <- mean(PSI)
  # Bn <- n/(k-1)*sum((PSIi_bar-PSIij_bar)^2)
  Bn <- n * var(PSIi_bar)
  Wn <- mean(apply(PSI, 1, "var"))
  Var <- (n - 1) / n * Wn + 1 / n * Bn
  R <- Var / Wn
  return(R)
}
```
```r
set.seed(12354)
n <- 20000  # reduced from 50000 to 20000 to save computation time
k <- 4
X <- matrix(0, k, n)
Y <- matrix(0, k, n)
a <- 2
b <- 3
m <- 5
for (kk in 1:k) {
  x <- matrix(0, n, 2)
  x[1, ] <- c(2 + 0.1 * kk, 0.5 + 0.1 * kk)
  # f(x|y) ~ Binomial(m, y)
  # f(y|x) ~ Beta(x + a, m - x + b)
  for (i in 2:n) {
    x[i, 1] <- rbinom(1, m, x[i - 1, 2])
    x[i, 2] <- rbeta(1, x[i - 1, 1] + a, m - x[i - 1, 1] + b)
  }
  X[kk, ] <- x[, 1]
  Y[kk, ] <- x[, 2]
}
```
```r
to_psi <- function(x) {
  x / c(1:length(x))
}
psi_x <- t(apply(apply(X, 1, cumsum), 2, to_psi))
psi_y <- t(apply(apply(Y, 1, cumsum), 2, to_psi))
```

```r
CR(psi_x, n, k)
CR(psi_y, n, k)
```
```r
# plot the psi_x
plot((500 + 1):n, psi_x[1, (500 + 1):n], ylim = c(1.9, 2.2), type = "l",
     xlab = 'Index', ylab = bquote(phi), main = "X")
for (i in 2:4) {
  lines((500 + 1):n, psi_x[i, (500 + 1):n], col = i + 1)
}
# plot the psi_y
plot((500 + 1):n, psi_y[1, (500 + 1):n], ylim = c(0.38, 0.42), type = "l",
     xlab = 'Index', ylab = bquote(phi), main = "Y")
for (i in 2:4) {
  lines((500 + 1):n, psi_y[i, (500 + 1):n], col = i + 1)
}
```
```r
# plot the R of x
par(mfrow = c(1, 1))
R <- numeric(n)
for (j in (500 + 1):n) {
  R[j] <- CR(psi_x[, 1:j], j, k)
}
plot(R[(500 + 1):n], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
# plot the R of y
par(mfrow = c(1, 1))
R <- numeric(n)
for (j in (500 + 1):n) {
  R[j] <- CR(psi_y[, 1:j], j, k)
}
plot(R[(500 + 1):n], type = "l", xlab = "", ylab = "R")
abline(h = 1.2, lty = 2)
```
(a)
```r
# factorial
jc <- function(n) {
  if (n == 0) {
    N_ <- 1
  } else {
    N_ <- 1
    for (i in 1:n) {
      N_ <- N_ * i
    }
  }
  return(N_)
}
# k-th term of the series
kterm <- function(k, a) {
  t1 <- (-1)^k / jc(k) / 2^(k)
  t2 <- sqrt(sum(a^2))^(2 * k + 2) / (2 * k + 1) / (2 * k + 2)
  d <- length(a)
  t3 <- gamma((d + 1) / 2) * gamma(k + 3 / 2) / gamma(k + d / 2 + 1)
  result <- t1 * t2 * t3
  return(result)
}
```
```r
# example 1
k <- 0
a <- c(2, 7, 4)
kterm(k, a)
# example 2
k <- 2
a <- c(9, 2, 5, 4)
kterm(k, a)
```
(b)
```r
ktermsum <- function(a, m) {
  total <- 0
  for (k in 0:m) {
    total <- total + kterm(k, a)
  }
  return(total)
}
```
(c)
```r
a <- c(1, 2)
ktermsum(a, 25)
ktermsum(a, 50)
```
By k = 25 the sum has already converged.
Results for Exercise 11.4:
```r
FF <- function(a, k) {
  f <- 1 - pt(sqrt((a^2 * k) / (k + 1 - a^2)), df = k)
  return(f)
}
# total function: left minus right
Flr <- function(a, k) {
  Ft <- FF(a, k) - FF(a, k - 1)
  return(Ft)
}
kk <- c(4:25, 100, 500, 1000)
Ak <- matrix(nrow = 3, ncol = length(kk))
colnames(Ak) <- kk
rownames(Ak) <- c('root', 'f.root', 'iter')
i <- 1
for (k in kk) {
  Ak[, i] <- unlist(uniroot(Flr, k = k, lower = 0.0001, upper = sqrt(k - 1))[1:3])
  i <- i + 1
}
# 11.4
Ak
```
```r
# integrate(f, 0, 1)
ck <- function(a, k) {
  return(sqrt(a^2 * k / (k + 1 - a^2)))
}
# left-hand side
F_l <- function(a, k) {
  K <- k
  c <- 2 * exp(log(gamma(k / 2)) - log(sqrt(pi * (k - 1))) - log(gamma((k - 1) / 2)))
  Ck <- ck(a, k - 1)
  f1 <- function(u) {
    f <- (1 + u^2 / (K - 1))^(-K / 2)
    return(f)
  }
  inte <- integrate(f1, 0, Ck)$value
  # inte <- pt(Ck, df = k - 1)
  Fl <- c * inte
  return(Fl)
}
# right-hand side
F_r <- function(a, k) {
  K <- k
  c <- 2 * exp(log(gamma((k + 1) / 2)) - log(sqrt(pi * k)) - log(gamma(k / 2)))
  Ck <- ck(a, k)
  f2 <- function(u) {
    f <- (1 + u^2 / K)^(-(K + 1) / 2)
    return(f)
  }
  inte <- integrate(f2, 0, Ck)$value
  # inte <- pt(Ck, df = k)
  Fr <- c * inte
  return(Fr)
}
F_total <- function(a, k) {
  Ft <- F_l(a, k) - F_r(a, k)
  return(Ft)
}
```
```r
# solve F_l - F_r = 0 for k = 4:25, 100, 500, 1000
kk <- c(4:25, 100, 500, 1000)
Ak <- matrix(nrow = 3, ncol = length(kk))
colnames(Ak) <- kk
rownames(Ak) <- c('root', 'f.root', 'iter')
i <- 1
for (k in kk) {
  # find the root of F_total
  if (k <= 22) {
    Root <- uniroot(F_total, k = k, interval = c(0.001, sqrt(k - 1)))
    Ak[, i] <- unlist(Root[1:3])
  } else if (k <= 100) {
    Root <- uniroot(F_total, k = k, interval = c(3, sqrt(k - 1e-5)))
    Ak[, i] <- unlist(Root[1:3])
  } else {
    Ak[, i] <- c(NaN, NaN, NaN)
  }
  i <- i + 1
}
F_total(3, 500)
F_total(6, 1000)
# 11.5
Ak
```
Comparing the results, the roots are almost identical for k up to 100; but for k = 500 and 1000 the gamma function used in the 11.5 formulation can no longer be evaluated, so F_total becomes NaN, while the 11.4 formulation can still be computed.
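The source of the failure can be checked directly: R's gamma() overflows to Inf for arguments above roughly 171, so gamma(k/2) cannot be evaluated for k = 500 or 1000, whereas the pt() form used in 11.4 involves no such factor (a quick check, not part of the original solution):

```r
# gamma() overflows for large arguments, which is what breaks the 11.5 formulation
gamma(250)   # Inf, with an overflow warning (this is gamma(k/2) for k = 500)
lgamma(250)  # the log-gamma value stays finite
```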
E-step: $Q(\lambda,\lambda^{(k)}) = E\left[\log\left(\lambda^n e^{-\lambda\sum_{i=1}^n T_i}\right)\mid Y,\lambda^{(k)}\right] = n\log\lambda - \lambda\sum_{i=1}^n E[T_i\mid Y_i,\lambda^{(k)}]$

Here $\lambda^{(k)}$ is the estimate of $\lambda$ at the k-th iteration, and $E[T_i\mid Y_i,\lambda^{(k)}] = Y_i$ if $Y_i < \tau$, and $\tau + 1/\lambda^{(k)}$ if $Y_i = \tau$ (censored observation).

Substituting gives $Q(\lambda,\lambda^{(k)}) = n\log\lambda -\lambda\left(\sum_{Y_i < \tau}Y_i+m(\tau+1/\lambda^{(k)})\right)$, where $m = \sum_{Y_i=\tau}1$.

M-step: differentiate Q with respect to $\lambda$ and set the derivative to zero: $Q'(\lambda,\lambda^{(k)})=n/\lambda - \left(\sum_{Y_i < \tau}Y_i+m(\tau+1/\lambda^{(k)})\right) = 0$, which gives $\lambda^{(k+1)} = n/\left(\sum_{Y_i < \tau}Y_i+m(\tau+1/\lambda^{(k)})\right)$.
```r
# EM
Y <- c(0.54, 0.48, 0.33, 0.43, 1.00, 1.00, 0.91, 1.00, 0.21, 0.85)
lambdak <- 1
lambdak1 <- 100
tau <- 1
n <- length(Y)
m <- sum(Y == tau)
l <- sum(Y[Y < tau])
while (abs(lambdak1 - lambdak) > 1e-7) {
  lambdak <- lambdak1
  lambdak1 <- n / (l + m * (tau + 1 / lambdak))
}
print(lambdak1)
```
MLE: $\log L(\lambda) = \log\left(\lambda^p e^{-\lambda\sum_{Y_i<\tau}Y_i}e^{-m\lambda\tau}\right) = p\log\lambda-\lambda \sum_{Y_i<\tau}Y_i - m\lambda \tau$, where $m = \sum_{Y_i=\tau}1$ and $p = \sum_{Y_i<\tau}1$.
```r
# MLE
library(stats4)
Y <- c(0.54, 0.48, 0.33, 0.43, 1.00, 1.00, 0.91, 1.00, 0.21, 0.85)
tau <- 1
n <- length(Y)
m <- sum(Y == tau)
p <- sum(Y < tau)
l <- sum(Y[Y < tau])
MLE <- function(lambda) {
  qq <- p * log(lambda) - lambda * l - m * lambda * tau
  return(qq)
}
lambda_max <- optimize(MLE, lower = 0.05, upper = 2, maximum = TRUE)$maximum
print(lambda_max)
```
The results of the two methods are very close.
Exercise 1
```r
trims <- c(0, 0.1, 0.2, 0.5)
x <- rcauchy(100)
lapply(trims, function(trim) mean(x, trim = trim))
lapply(trims, mean, x = x)
```
The two results are the same because lapply has a '...' argument for "optional arguments to FUN". In the second call, 'x = x' is passed on to mean, which fixes mean's first argument to x; each element of trims is then matched to mean's second argument trim, which is exactly what the anonymous function in the first call does.
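A quick check (added here as a sketch) confirms that the two calls return identical results:

```r
trims <- c(0, 0.1, 0.2, 0.5)
set.seed(1)
x <- rcauchy(100)
identical(lapply(trims, function(trim) mean(x, trim = trim)),
          lapply(trims, mean, x = x))  # TRUE
```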
Exercise 5:For each model in the previous two exercises, extract R2 using the function below.
```r
# ex3
# R^2 function
rsq <- function(mod) {
  summary(mod)$r.squared
}
# define the list of model formulas to fit
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
models <- lapply(formulas, lm, data = mtcars)
R2 <- lapply(models, rsq)
R2
```
```r
# ex4
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
# fit
models <- lapply(bootstraps, function(df) lm(mpg ~ disp, data = df))
R2 <- lapply(models, rsq)
R2
```
Exercise 1:
a)
```r
x <- data.frame(matrix(sample(2:30, 20, rep = TRUE), 5, 4))
x
```

```r
vapply(x, sd, c(1))
```
b)
```r
X <- data.frame(A = 1:10, B = sample(LETTERS[1:6], 10, rep = TRUE), C = runif(10))
X
SD <- function(x) {
  sd(x, na.rm = TRUE)
}
# method 1
vapply(X[, vapply(X, mode, "r") == 'numeric'], SD, numeric(1))
# method 2
vapply(X[, vapply(X, is.numeric, TRUE)], SD, numeric(1))
```
Exercise 7:
```r
# definition of sapply (from base R)
sapply <- function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
  FUN <- match.fun(FUN)
  answer <- lapply(X = X, FUN = FUN, ...)
  if (USE.NAMES && is.character(X) && is.null(names(answer)))
    names(answer) <- X
  if (!isFALSE(simplify))
    simplify2array(answer, higher = (simplify == "array"))
  else answer
}
```
The definition of sapply shows that it is just a wrapper around lapply with some post-processing, so to obtain mcsapply we only need to replace lapply by mclapply. The modification is as follows:
```r
library(parallel)
mcsapply <- function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE,
                     mc.preschedule = TRUE, mc.set.seed = TRUE,
                     mc.silent = FALSE, mc.cores = getOption("mc.cores", 2L),
                     mc.cleanup = TRUE, mc.allow.recursive = TRUE,
                     affinity.list = NULL) {
  FUN <- match.fun(FUN)
  answer <- mclapply(X = X, FUN = FUN, ...,
                     mc.preschedule = mc.preschedule, mc.set.seed = mc.set.seed,
                     mc.silent = mc.silent, mc.cores = mc.cores,
                     mc.cleanup = mc.cleanup,
                     mc.allow.recursive = mc.allow.recursive,
                     affinity.list = affinity.list)
  if (USE.NAMES && is.character(X) && is.null(names(answer)))
    names(answer) <- X
  if (!isFALSE(simplify))
    simplify2array(answer, higher = (simplify == "array"))
  else answer
}
```
```r
# library(parallel)
# unlist(mclapply(1:20, sqrt, mc.cores = 4))
# unlist(mcsapply(1:20, sqrt, mc.cores = 4))
# Output:
# [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427 3.000000 3.162278 3.316625 3.464102
# [13] 3.605551 3.741657 3.872983 4.000000 4.123106 4.242641 4.358899 4.472136
# [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427 3.000000 3.162278 3.316625 3.464102
# [13] 3.605551 3.741657 3.872983 4.000000 4.123106 4.242641 4.358899 4.472136
```
```r
# definition of vapply
vapply <- function(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE) {
  FUN <- match.fun(FUN)
  if (!is.vector(X) || is.object(X))
    X <- as.list(X)
  .Internal(vapply(X, FUN, FUN.VALUE, USE.NAMES))
}
```
The definition of vapply shows that it is implemented directly through .Internal rather than as a wrapper around lapply, so its structure is fundamentally different from that of sapply and it is not straightforward to define an mcvapply in the same way.
```r
# Cpp
library(Rcpp)
# dir_cpp <- '/data/cenmin/statistical computing/Rcpp/'
# sourceCpp(paste0(dir_cpp, "GBSampler.cpp"))
cppFunction("
NumericMatrix gbsamplerC(int m, int a, int b, int n) {
  NumericMatrix x(m, 2);
  x(0, 0) = 2.5;
  x(0, 1) = 0.5;
  for (int i = 1; i < m; i++) {
    x(i, 0) = R::rbinom(n, x(i-1, 1));
    x(i, 1) = R::rbeta(x(i-1, 0) + a, n - x(i-1, 0) + b);
  }
  return x;
}
")
set.seed(12)
m <- 10000
a <- 2
b <- 3
n <- 5
t1 <- proc.time()
xC <- gbsamplerC(m, a, b, n)
t2 <- proc.time()
t <- t2 - t1
paste0(t[3][[1]], 's')
```
```r
# R
gbsampler <- function(m, a, b, n) {
  x <- matrix(0, m, 2)
  x[1, ] <- c(2.5, 0.5)
  for (i in 2:m) {
    x[i, 1] <- rbinom(1, n, x[i - 1, 2])
    x[i, 2] <- rbeta(1, x[i - 1, 1] + a, n - x[i - 1, 1] + b)
  }
  return(x)
}
set.seed(24)
t1 <- proc.time()
xR <- gbsampler(m, a, b, n)
t2 <- proc.time()
t <- t2 - t1
paste0(t[3][[1]], 's')
```
The function written with Rcpp runs in less time.
```r
qqplot(xC[1000:dim(xC)[1], 1], xR[1000:dim(xR)[1], 1],
       main = 'qqplot of X1', ylab = "xR-x1", xlab = "xC-x1")
lines(-1:6, -1:6, col = 'red', lwd = 2)
qqplot(xC[1000:dim(xC)[1], 2], xR[1000:dim(xR)[1], 2],
       main = 'qqplot of X2', ylab = "xR-x2", xlab = "xC-x2")
lines(-1:1, -1:1, col = 'red', lwd = 2)
```
The qq-plots show that the random samples generated with Rcpp and with pure R have the same distribution.
Compare the running speeds:
```r
library(microbenchmark)
# times <- microbenchmark(timecpp = system.time(gbsamplerC(m, a, b, n)),
#                         timeR = system.time(gbsampler(m, a, b, n)))
# summary(times)
## Output:
##      expr      min       lq     mean   median       uq      max neval
## 1 timecpp 33.86392 34.17796 35.86211 37.09817 37.41579 38.78914   100
## 2   timeR 68.21935 68.83497 70.66664 70.87941 71.98854 78.27610   100
```
The Gibbs sampler written with Rcpp is faster than the one written in pure R.
In summary, a function written with Rcpp gives the same results as the pure R version but in less time; for larger projects this can save a great deal of time and improve efficiency.