Use knitr to produce at least 3 examples (texts, figures, tables).
Example 1: generate 10 random numbers from the standard normal distribution N(0, 1).

```r
x <- rnorm(10)  # 10 draws from N(0, 1)
print(x)
```
example2 生成U(0,1),U(0,2)和U(0,0.5)的概率密度函数
x <- seq(-5,5,length.out=100) plot(x,dnorm(x,0,0.5),type='l') lines(x,dnorm(x,0,1)) lines(x,dnorm(x,0,2))
Example 3: build a small data frame.

```r
num  <- 1:3
name <- c("张三", "李四", "王五")
sex  <- c("女", "男", "女")
x <- data.frame(num, name, sex)
x
```
3.4
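Exercise 3.4 asks for a sampler from the Rayleigh($\sigma$) distribution. The code below uses the inverse-transform method; the inversion step (added here for clarity) is
$$F(x)=1-e^{-x^2/(2\sigma^2)},\ x\ge 0,\qquad u=F(x)\iff x=\sigma\sqrt{-2\log(1-u)}.$$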
```r
n <- 500
u <- runif(n)
sigma <- runif(1, 0, 1000)              # pick a random value of sigma
x <- sqrt(-2 * sigma^2 * log(1 - u))    # inverse-transform sample
hist(x, prob = TRUE,
     main = expression(f(x) == x / sigma^2 * exp(-x^2 / (2 * sigma^2))))
y <- seq(0, max(x), length.out = 1000)  # grid covering the sample range
lines(y, y / sigma^2 * exp(-y^2 / (2 * sigma^2)))  # theoretical density
```
3.11 The mixture is bimodal; the two modes should be most distinct when the mixing probability is around 0.5.
```r
n <- 10000
X1 <- rnorm(n, 0, 1)
X2 <- rnorm(n, 3, 1)
# mixture with p = 0.5: r is a Bernoulli indicator, not a fixed weight
r <- sample(c(0, 1), n, replace = TRUE)
Z <- r * X1 + (1 - r) * X2
hist(Z)
# repeat for a range of mixing probabilities; the indicator must be resampled
# with probability p (a fixed weighted sum p*X1 + (1-p)*X2 would be normal)
for (p in c(0.75, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
  r <- sample(c(1, 0), n, replace = TRUE, prob = c(p, 1 - p))
  Z <- r * X1 + (1 - r) * X2
  hist(Z, main = bquote(p == .(p)))
}
```
3.20 Take $\lambda = 2$ for the Poisson process and (1, 2) for the Gamma parameters. First approach. By direct calculation (see the note below), X(1) has mean and variance 1 and X(10) has mean and variance 10.
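A reasoning step added here for clarity: for a compound Poisson process $X(t)=\sum_{i=1}^{N(t)} Y_i$ with $N(t)\sim\mathrm{Poisson}(\lambda t)$,
$$E[X(t)]=\lambda t\,E[Y_1],\qquad \mathrm{Var}[X(t)]=\lambda t\,E[Y_1^2].$$
With $\lambda=2$ and $Y_1\sim\Gamma(1,2)$ we have $E[Y_1]=1/2$ and $E[Y_1^2]=1/2$, so $X(1)$ has mean and variance 1, while $X(10)$ has mean and variance 10.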
```r
n <- 1000
# X(1): N ~ Poisson(2); the sum of N iid Gamma(1, 2) variables is Gamma(N, 2)
p <- rpois(n, 2)
x <- rgamma(n, 1 * p, 2)
# X(10): N ~ Poisson(10 * 2)
q <- rpois(n, 10 * 2)
y <- rgamma(n, q * 1, 2)
print(c(mean(x), var(x)))   # X(1): theoretical mean and variance are both 1
print(c(mean(y), var(y)))   # X(10): theoretical mean and variance are both 10
```
Second approach: generate n values in total; each value is the sum of i draws from the Gamma distribution, where i itself is drawn from the Poisson distribution (just a sketch of the idea).
```r
n <- 1000
y <- numeric(n)
for (j in 1:n) {
  x <- rgamma(rpois(1, 2), 1, 2)   # N ~ Poisson(2) draws from Gamma(1, 2)
  y[j] <- sum(x)
}
print(mean(y))
print(var(y))
```
Exercises 5.4, 5.9, 5.13, and 5.14 (Statistical Computing with R)
5.4
```r
fx <- function(i) {
  m <- 5000
  x <- runif(m, 0, i)
  theta <- mean(30 * x^2 * (1 - x)^2 * i)   # MC estimate of the Beta(3, 3) cdf at i
  print(c(i, theta, pbeta(i, 3, 3)))        # compare with the true value
}
for (j in seq(0.1, 0.9, 0.1)) {
  fx(j)
}
```
The Monte Carlo estimates are very close to the true values.
5.9
```r
fx <- function(x, r = 1000, antithetic = TRUE) {
  u <- runif(r / 2)
  if (!antithetic) v <- runif(r / 2) else v <- 1 - u   # antithetic pairs
  u <- c(u, v)
  cdf <- numeric(length(x))
  for (i in 1:length(x)) {
    g <- x[i]^2 * u * exp(-(x[i] * u)^2 / 2)
    cdf[i] <- mean(g)
  }
  cdf
}
x <- seq(1, 100, 20)
set.seed(123)
fx1 <- fx(x, antithetic = FALSE)   # ordinary Monte Carlo
set.seed(123)
fx2 <- fx(x, antithetic = TRUE)    # antithetic variables
print(round(rbind(x, fx1, fx2), 5))
```
Here $\sigma$ is taken to be 1, so g(x) in the code is written with $\sigma$ already substituted in. fx1 is the ordinary Monte Carlo estimate and fx2 is the antithetic estimate, evaluated at x = 1, 21, 41, 61, 81; the two estimates agree closely.
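Exercise 5.9 also asks for the variance reduction achieved by antithetic sampling. A minimal sketch, added here; the evaluation point x = 1.5 and the m = 1000 replications are assumptions, not from the original:

```r
# Replicate both estimators at a single point and compare their variances.
m <- 1000
est1 <- est2 <- numeric(m)
for (i in 1:m) {
  est1[i] <- fx(1.5, r = 1000, antithetic = FALSE)  # independent uniforms
  est2[i] <- fx(1.5, r = 1000, antithetic = TRUE)   # antithetic pairs
}
100 * (var(est1) - var(est2)) / var(est1)   # percent variance reduction
```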
5.13
```r
n <- 5000
g <- function(x) {
  x^2 * exp(-x^2 / 2) / sqrt(2 * pi) * (x > 1)
}
hat <- se <- numeric(2)
# importance density f1(x) = exp(1 - x), x > 1 (shifted exponential)
x <- rexp(n, 1) + 1
fg <- g(x) / exp(1 - x)
hat[1] <- mean(fg)
se[1] <- sd(fg)
# importance density f2(x) = 4 (x - 1) exp(-2 (x - 1)), x > 1 (shifted Gamma(2, 2))
x <- rgamma(n, 2, 2) + 1
fg <- g(x) / (4 * (x - 1) * exp(-2 * (x - 1)))
hat[2] <- mean(fg)
se[2] <- sd(fg)
rbind(hat, se)
```
The first importance density is $f_1(x) = e^{1-x}$, $x > 1$; the second is a Gamma(2, 2) density shifted one unit to the right (the shift is needed because the problem restricts the support of f(x) to x > 1). Repeated runs show that the first density gives the smaller variance.
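For reference, both estimates use the standard importance-sampling identity (stated here for clarity):
$$\theta=\int_1^\infty g(x)\,dx=E_f\!\left[\frac{g(X)}{f(X)}\right]\approx\frac{1}{n}\sum_{i=1}^n\frac{g(X_i)}{f(X_i)},\qquad X_i\sim f.$$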
5.14
```r
n <- 5000
g <- function(x) {
  x^2 * exp(-x^2 / 2) / sqrt(2 * pi) * (x > 1)
}
x <- rexp(n, 1) + 1        # f(x) = exp(1 - x), x > 1
fg <- g(x) / exp(1 - x)
hat <- mean(fg)
se <- sd(fg)
print(hat)
print(se)
```
The chosen f(x) is the exponential distribution with rate 1 shifted to x > 1; the importance-sampling estimate is accurate.
Exercises 6.5 and 6.A (pages 180-181)
6.5
```r
# set up the simulation parameters
UCL <- numeric(1000)
P <- numeric(2)
n <- 20
alpha <- 0.05
# normal data: one-sided upper confidence limit for the variance (Example 6.4)
for (i in 1:1000) {
  x <- rnorm(n, mean = 0, sd = 2)
  UCL[i] <- (n - 1) * var(x) / qchisq(alpha, df = n - 1)
}
P[1] <- mean(UCL > 4)
# chi-squared(2) data: 95% symmetric t-interval for the mean
for (i in 1:1000) {
  x <- rchisq(n, df = 2)
  UCL[i] <- (mean(x) - 2) * sqrt(n) / sd(x)        # t statistic; true mean is 2
}
P[2] <- mean(abs(UCL) < qt(1 - alpha / 2, n - 1))  # two-sided, so alpha/2
# output table
d <- data.frame(P, row.names = c("norm", "chisq"))
knitr::kable(d)
```
6.A
```r
# chi-squared(1) data
n <- 20
alpha <- 0.05
mu0 <- 1
m <- 1000
p <- numeric(m)
for (j in 1:m) {
  x <- rchisq(n, 1)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))
# U(0, 2) data
for (j in 1:m) {
  x <- runif(n, 0, 2)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))
# exponential(1) data
for (j in 1:m) {
  x <- rexp(n, 1)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))
```
Discussion:
(1) Null hypothesis: $p_1 = p_2$; alternative hypothesis: $p_1 \neq p_2$. (2) In the large-sample case, use a paired-samples t-test (or z-test). The two methods are applied to the same simulated samples, so their results are not independent, and a two-sample t-test cannot be used. The McNemar test checks whether two factors are associated (a test of independence), which is not what is needed here. (3) We need the number of experiments n (known), the observed values of the two powers (known), and the p-value (unknown); so it suffices to obtain the p-value, i.e., the theoretical value of p.
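A minimal sketch of the paired z-test described in (2). The per-replicate rejection indicators and the power values used to simulate them are placeholders, not from the original:

```r
# Paired z-test for H0: p1 = p2, using per-replicate 0/1 rejection indicators.
m <- 10000
x <- rbinom(m, 1, 0.651)   # placeholder indicators for method 1
y <- rbinom(m, 1, 0.676)   # placeholder indicators for method 2
d <- x - y                 # paired differences
z <- mean(d) / sqrt(var(d) / m)       # large-sample z statistic
p.value <- 2 * (1 - pnorm(abs(z)))    # two-sided p-value
round(c(z = z, p.value = p.value), 4)
```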
Exercise 6.C (page 182, Statistical Computing with R)
6.C Repeat Examples 6.8 and 6.10 for Mardia's multivariate skewness test. Mardia proposed tests of multivariate normality based on multivariate generalizations of skewness and kurtosis. If X and Y are iid, the multivariate population skewness $\beta_{1,d}$ is defined by Mardia as $$\beta_{1,d}=E\left[(X-\mu)^T\Sigma^{-1}(Y-\mu)\right]^3.$$ Under normality, $\beta_{1,d}=0$. The multivariate skewness statistic is $$b_{1,d}=\frac{1}{n^2}\sum_{i,j=1}^n\left((X_i-\bar{X})^T\hat{\Sigma}^{-1}(X_j-\bar{X})\right)^3,$$ where $\hat{\Sigma}$ is the maximum likelihood estimator of covariance. Large values of $b_{1,d}$ are significant. The asymptotic distribution of $\frac{nb_{1,d}}{6}$ is chi-squared with $\frac{d(d+1)(d+2)}{6}$ degrees of freedom.
This is the multivariate skewness test applied to Example 6.8. We generate variables following $N(\mu,\Sigma)$, where $$\mu=(0,0)^T,\qquad \Sigma=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix}.$$
```r
library(MASS)  # for mvrnorm, to generate multivariate normal samples
func <- function(data) {
  n <- nrow(data)   # sample size
  d <- ncol(data)   # dimension
  central <- data
  for (i in 1:d) {
    central[, i] <- data[, i] - mean(data[, i])     # center each column
  }
  sigma <- t(central) %*% central / n               # MLE of the covariance matrix
  a <- central %*% solve(sigma) %*% t(central)
  b <- sum(colSums(a^3)) / (n^2)                    # skewness statistic b_{1,d}
  test <- n * b / 6                                 # asymptotically chi-squared
  chi <- qchisq(0.95, d * (d + 1) * (d + 2) / 6)    # critical value
  as.integer(test > chi)                            # 1 = reject normality
}
mu <- c(0, 0)                                       # mean vector
sigma <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)  # covariance matrix
m <- 100                                            # number of replications
n <- c(10, 20, 30, 50, 100, 200, 300, 400, 500)     # sample sizes
a <- numeric(length(n))
for (i in 1:length(n)) {
  a[i] <- mean(replicate(m, expr = {
    data <- mvrnorm(n[i], mu, sigma)
    func(data)
  }))
}
print(a)
```
From the results we can see that for n > 50 the empirical Type I error rate is close to the nominal level 0.05.
Next we estimate by simulation the power of the skewness test of normality against a contaminated normal (normal scale mixture) alternative. The contaminated normal distribution is denoted by $(1-\epsilon)N(\mu_1,\Sigma_1)+\epsilon N(\mu_2,\Sigma_2)$, where $$\mu_1=\mu_2=(0,0)^T,\qquad \Sigma_1=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix},\qquad \Sigma_2=\begin{pmatrix} 100 & 0 \\ 0 & 100 \end{pmatrix}.$$
```r
mu1 <- mu2 <- c(0, 0)
sigma1 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
sigma2 <- matrix(c(100, 0, 0, 100), nrow = 2, ncol = 2)
m <- 100
n <- 100
epsilon <- c(seq(0, 0.1, 0.01), seq(0.1, 1, 0.05))
N <- length(epsilon)
pwr <- numeric(N)
for (j in 1:N) {
  e <- epsilon[j]
  sktests <- numeric(m)
  for (i in 1:m) {
    index <- sample(c(1, 2), replace = TRUE, size = n, prob = c(1 - e, e))
    data <- matrix(0, nrow = n, ncol = 2)
    for (t in 1:n) {
      if (index[t] == 1) data[t, ] <- mvrnorm(1, mu1, sigma1)
      else data[t, ] <- mvrnorm(1, mu2, sigma2)
    }
    sktests[i] <- func(data)
  }
  pwr[j] <- mean(sktests)
}
plot(epsilon, pwr, type = "b", xlab = bquote(epsilon), ylim = c(0, 1))
abline(h = 0.05, lty = 3)         # nominal level of the test
se <- sqrt(pwr * (1 - pwr) / m)   # standard errors of the power estimates
lines(epsilon, pwr + se, lty = 3)
lines(epsilon, pwr - se, lty = 3)
```
If $\epsilon = 0$ or $\epsilon = 1$ the sample is exactly normal, so the situation reduces to the first simulation. For $0 < \epsilon < 1$ the empirical power is greater than 0.05.
Exercises 7.7, 7.8, 7.9, and 7.B (page 213, Statistical Computing with R)
7.7 Refer to Exercise 7.6. Efron and Tibshirani discuss the following example. The five-dimensional scores data have a $5\times 5$ covariance matrix $\Sigma$, with positive eigenvalues $\lambda_1>\lambda_2>\lambda_3>\lambda_4>\lambda_5$. In principal components analysis, $$\theta=\frac{\lambda_1}{\sum_{j=1}^{5}\lambda_j}$$ measures the proportion of variance explained by the first principal component. Let $\hat\lambda_1>\hat\lambda_2>\hat\lambda_3>\hat\lambda_4>\hat\lambda_5$ be the eigenvalues of $\hat\Sigma$, where $\hat\Sigma$ is the MLE of $\Sigma$. Compute the sample estimate $$\hat\theta=\frac{\hat\lambda_1}{\sum_{j=1}^{5}\hat\lambda_j}$$ of $\theta$. Use bootstrap to estimate the bias and standard error of $\hat\theta$.
```r
library(bootstrap)   # for the scor data
n <- nrow(scor)
lambda_hat <- eigen(cov(scor))$values
theta_hat <- lambda_hat[1] / sum(lambda_hat)
B <- 1000
theta <- numeric(B)
for (b in 1:B) {
  # resample rows; sample() applied to a data frame would resample columns
  scor1 <- scor[sample(1:n, size = n, replace = TRUE), ]
  lambda <- eigen(cov(scor1))$values
  theta[b] <- lambda[1] / sum(lambda)
}
bias_boot <- mean(theta) - theta_hat
se_boot <- sd(theta)
print(round(c(bias_boot = bias_boot, se_boot = se_boot), 4))
```
7.8 Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat\theta$.
```r
library(bootstrap)
set.seed(123)
n <- nrow(scor)
lambda_hat <- eigen(cov(scor))$values
theta_hat <- lambda_hat[1] / sum(lambda_hat)
theta_j <- numeric(n)
for (i in 1:n) {
  x <- scor[-i, ]                      # leave one observation out
  lambda <- eigen(cov(x))$values
  theta_j[i] <- lambda[1] / sum(lambda)
}
bias_jack <- (n - 1) * (mean(theta_j) - theta_hat)
se_jack <- (n - 1) * sqrt(var(theta_j) / n)
print(round(c(bias_jack = bias_jack, se_jack = se_jack), 4))
```
7.9 Refer to Exercise 7.7. Compute 95% percentile and BCa confidence intervals for $\hat\theta$.
```r
# 95% percentile confidence interval
library(boot)
theta.boot <- function(dat, i) {
  lambda <- eigen(cov(dat[i, ]))$values
  lambda[1] / sum(lambda)
}
dat <- scor
boot.obj <- boot(dat, statistic = theta.boot, R = 1000)
print(boot.obj)
alpha <- c(0.025, 0.975)
print(quantile(boot.obj$t, alpha, type = 6))
```
```r
# 95% BCa confidence intervals
library(boot)
theta.boot <- function(dat, i) {
  lambda <- eigen(cov(dat[i, ]))$values
  lambda[1] / sum(lambda)
}
dat <- scor
boot.obj <- boot(dat, statistic = theta.boot, R = 1000)
print(boot.ci(boot.obj, type = c("bca", "perc")))
```
The percentile interval is therefore computed twice (once by hand, once by boot.ci); the two results differ very little.
7.B Repeat Project 7.A for the sample skewness statistic. Compare the coverage rates for normal populations (skewness 0) and $\chi^2(5)$ distributions (positive skewness).
```r
library(boot)
n <- 100
m <- 100
# sample skewness statistic, computed on the bootstrap resample x[i]
skew.stat <- function(x, i) {
  xm <- mean(x[i])
  m3 <- mean((x[i] - xm)^3)
  m2 <- mean((x[i] - xm)^2)
  m3 / m2^1.5
}
# normal populations: true skewness is 0
p1 <- p2 <- p3 <- numeric(m)
for (i in 1:m) {
  x <- rnorm(n)
  boot.obj <- boot(x, statistic = skew.stat, R = 1000)
  a <- boot.ci(boot.obj, conf = 0.95, type = c("basic", "norm", "perc"))
  p1[i] <- (a$norm[2] < 0) * (0 < a$norm[3])
  p2[i] <- (a$perc[4] < 0) * (0 < a$perc[5])
  p3[i] <- (a$basic[4] < 0) * (0 < a$basic[5])
}
mean(p1); mean(p2); mean(p3)   # coverage rates: normal, percentile, basic
# chi-squared(5) populations: true skewness is sqrt(8/5)
skew <- sqrt(8 / 5)
r1 <- r2 <- r3 <- numeric(m)
for (i in 1:m) {
  y <- rchisq(n, 5)
  boot.obj <- boot(y, statistic = skew.stat, R = 1000)
  b <- boot.ci(boot.obj, conf = 0.95, type = c("basic", "norm", "perc"))
  r1[i] <- (b$norm[2] < skew) * (skew < b$norm[3])
  r2[i] <- (b$perc[4] < skew) * (skew < b$perc[5])
  r3[i] <- (b$basic[4] < skew) * (skew < b$basic[5])
}
mean(r1); mean(r2); mean(r3)
```
Exercise 8.2 (page 242, Statistical Computing with R).
8.2 Implement the bivariate Spearman rank correlation test for independence [255] as a permutation test. The Spearman rank correlation test statistic can be obtained from function cor with method = "spearman". Compare the achieved significance level of the permutation test with the p-value reported by cor.test on the same samples.
```r
a <- c(12, 54, 13, 45, 32, 2)
b <- c(1, 78, 28, 46, 69, 46)
M <- 500
abc <- c(a, b)
n <- length(a)
len <- numeric(M)
t0 <- cor.test(a, b, method = "spearman")$estimate
for (i in 1:M) {
  gs <- sample(1:12, size = n, replace = FALSE)  # resample the group labels each time
  a1 <- abc[gs]
  b1 <- abc[-gs]
  len[i] <- cor.test(a1, b1, method = "spearman")$estimate
}
p <- mean(abs(c(t0, len)) >= abs(t0))            # achieved significance level
round(c(p, cor.test(a, b, method = "spearman")$p.value), 3)
```
Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations:

* Unequal variances and equal expectations
* Unequal variances and unequal expectations
* Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodal distribution (mixture of two normal distributions)
* Unbalanced samples (say, 1 case versus 10 controls)

Note: the parameters should be chosen such that the powers are distinguishable (say, range from 0.3 to 0.8).
Unequal variances and equal expectations (multivariate normal distributions). Distribution 1: expectation $\mu_1=(0,0)^T$, covariance $$\Sigma_{1}=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix};$$ distribution 2: expectation $\mu_2=(0,0)^T$, covariance $$\Sigma_{2}=\begin{pmatrix} 3 & 0 \\ 0 & 3 \end{pmatrix}.$$
```r
library(RANN)     # nn2: fast nearest-neighbour search
library(boot)
library(Ball)
library(energy)
library(MASS)
# nearest-neighbour statistic for the permutation test
Function1 <- function(z, ix, sizes, k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, , drop = FALSE]       # keep matrix form even for 1-D data
  NN <- nn2(data = z, k = k + 1)
  x1 <- NN$nn.idx[1:n1, -1]        # drop column 1: each point is its own nearest neighbour
  x2 <- NN$nn.idx[(n1 + 1):n, -1]
  i1 <- sum(x1 < n1 + 0.5)
  i2 <- sum(x2 > n1 + 0.5)
  (i1 + i2) / (k * n)
}
test.nn <- function(z, sizes, k) {
  boot.obj <- boot(data = z, statistic = Function1, R = R,
                   sim = "permutation", sizes = sizes, k = k)
  vec <- c(boot.obj$t0, boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic = vec[1], p.value = p)
}
set.seed(1111)
u11 <- c(0, 0)
sigma11 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
u12 <- c(0, 0)
sigma12 <- matrix(c(3, 0, 0, 3), nrow = 2, ncol = 2)
n1 <- 28
n2 <- 28
n <- n1 + n2
N <- c(n1, n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA, m, 3)
for (i in 1:m) {
  sj1 <- mvrnorm(n1, u11, sigma11)
  sj2 <- mvrnorm(n2, u12, sigma12)
  sj <- rbind(sj1, sj2)
  pvalue[i, 1] <- test.nn(sj, N, k)$p.value
  pvalue[i, 2] <- eqdist.etest(sj, sizes = N, R = R)$p.value
  pvalue[i, 3] <- bd.test(x = sj1, y = sj2, num.permutations = R)$p.value
}
a1 <- mean(pvalue[, 1] < alpha)   # NN power
a2 <- mean(pvalue[, 2] < alpha)   # energy power
a3 <- mean(pvalue[, 3] < alpha)   # ball power
print(c(a1, a2, a3))
```
Unequal variances and unequal expectations
Distribution 1: expectation $\mu_1=(0,0)^T$, covariance $$\Sigma_{1}=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix};$$ distribution 2: expectation $\mu_2=(1,1)^T$, covariance $$\Sigma_{2}=\begin{pmatrix} 4 & 0 \\ 0 & 4 \end{pmatrix}.$$
```r
# reuse Function1 and test.nn defined above
u21 <- c(0, 0)
sigma21 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
u22 <- c(1, 1)
sigma22 <- matrix(c(4, 0, 0, 4), nrow = 2, ncol = 2)
n1 <- 28
n2 <- 28
n <- n1 + n2
N <- c(n1, n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA, m, 3)
for (i in 1:m) {
  sj1 <- mvrnorm(n1, u21, sigma21)
  sj2 <- mvrnorm(n2, u22, sigma22)
  sj <- rbind(sj1, sj2)
  pvalue[i, 1] <- test.nn(sj, N, k)$p.value
  pvalue[i, 2] <- eqdist.etest(sj, sizes = N, R = R)$p.value
  pvalue[i, 3] <- bd.test(x = sj1, y = sj2, num.permutations = R)$p.value
}
a1 <- mean(pvalue[, 1] < alpha)
a2 <- mean(pvalue[, 2] < alpha)
a3 <- mean(pvalue[, 3] < alpha)
print(c(a1, a2, a3))
```
Non-normal distributions: t distribution with 1 df (heavy-tailed distribution); the bimodal mixture case follows below. The two samples are drawn from noncentral t distributions, $t(\mathrm{df}=1,\ \mathrm{ncp}=3)$ and $t(\mathrm{df}=1,\ \mathrm{ncp}=6)$.
```r
# reuse Function1 and test.nn defined above
n1 <- 28
n2 <- 28
n <- n1 + n2
N <- c(n1, n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
p_value <- matrix(NA, m, 3)
for (i in 1:m) {
  sj1 <- as.matrix(rt(n1, 1, 3), ncol = 1)   # t(df = 1, ncp = 3)
  sj2 <- as.matrix(rt(n2, 1, 6), ncol = 1)   # t(df = 1, ncp = 6)
  sj <- rbind(sj1, sj2)
  p_value[i, 1] <- test.nn(sj, N, k)$p.value
  p_value[i, 2] <- eqdist.etest(sj, sizes = N, R = R)$p.value
  p_value[i, 3] <- bd.test(x = sj1, y = sj2, num.permutations = R)$p.value
}
a1 <- mean(p_value[, 1] < alpha)
a2 <- mean(p_value[, 2] < alpha)
a3 <- mean(p_value[, 3] < alpha)
print(c(a1, a2, a3))
```
Bimodal distribution (mixture of two normal distributions). Sample 1: $sj_1 \sim 0.3\,N(\mu_1,\Sigma_1) + 0.7\,N(\mu_2,\Sigma_2)$ with $\mu_1=(0,0)^T$, $\mu_2=(2,5)^T$, $$\Sigma_{1}=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix},\qquad \Sigma_{2}=\begin{pmatrix} 8 & 0 \\ 0 & 2 \end{pmatrix}.$$ Sample 2: $sj_2 \sim 0.4\,N(\mu_3,\Sigma_3) + 0.6\,N(\mu_4,\Sigma_4)$ with $\mu_3=(1,4)^T$, $\mu_4=(3,7)^T$, $$\Sigma_{3}=\begin{pmatrix} 2 & 0 \\ 0 & 3 \end{pmatrix},\qquad \Sigma_{4}=\begin{pmatrix} 4 & 0 \\ 0 & 6 \end{pmatrix}.$$
```r
# reuse Function1 and test.nn defined above
n1 <- 18   # sample 1 size
n2 <- 28   # sample 2 size
n <- n1 + n2
N <- c(n1, n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA, m, 3)
u31 <- c(0, 0); sigma31 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
u32 <- c(2, 5); sigma32 <- matrix(c(8, 0, 0, 2), nrow = 2, ncol = 2)
u33 <- c(1, 4); sigma33 <- matrix(c(2, 0, 0, 3), nrow = 2, ncol = 2)
u34 <- c(3, 7); sigma34 <- matrix(c(4, 0, 0, 6), nrow = 2, ncol = 2)
# draw each observation from one of the two components (a genuine mixture;
# a fixed weighted sum of two normal vectors would again be normal)
rmix <- function(n, p, mu.a, sig.a, mu.b, sig.b) {
  idx <- rbinom(n, 1, p)
  out <- mvrnorm(n, mu.a, sig.a)
  out[idx == 1, ] <- mvrnorm(sum(idx), mu.b, sig.b)
  out
}
for (i in 1:m) {
  sj1 <- rmix(n1, 0.7, u31, sigma31, u32, sigma32)   # 0.3 N1 + 0.7 N2
  sj2 <- rmix(n2, 0.6, u33, sigma33, u34, sigma34)   # 0.4 N3 + 0.6 N4
  sj <- rbind(sj1, sj2)
  pvalue[i, 1] <- test.nn(sj, N, k)$p.value
  pvalue[i, 2] <- eqdist.etest(sj, sizes = N, R = R)$p.value
  pvalue[i, 3] <- bd.test(x = sj1, y = sj2, num.permutations = R)$p.value
}
a1 <- mean(pvalue[, 1] < alpha)
a2 <- mean(pvalue[, 2] < alpha)
a3 <- mean(pvalue[, 3] < alpha)
print(c(a1, a2, a3))
```
Unbalanced samples (say, 1 case versus 10 controls). Distribution 1: expectation $\mu_1=(0,0)^T$, covariance $$\Sigma_{1}=\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix};$$ distribution 2: expectation $\mu_2=(0,0)^T$, covariance $$\Sigma_{2}=\begin{pmatrix} 3 & 0 \\ 0 & 3 \end{pmatrix};$$ with sample sizes $n_1=38$, $n_2=28$.
```r
# reuse Function1 and test.nn defined above
u41 <- c(0, 0)
sigma41 <- matrix(c(1, 0, 0, 1), nrow = 2, ncol = 2)
u42 <- c(0, 0)
sigma42 <- matrix(c(3, 0, 0, 3), nrow = 2, ncol = 2)
n1 <- 38
n2 <- 28
n <- n1 + n2
N <- c(n1, n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA, m, 3)
for (i in 1:m) {
  sj1 <- mvrnorm(n1, u41, sigma41)
  sj2 <- mvrnorm(n2, u42, sigma42)
  sj <- rbind(sj1, sj2)
  pvalue[i, 1] <- test.nn(sj, N, k)$p.value
  pvalue[i, 2] <- eqdist.etest(sj, sizes = N, R = R)$p.value
  pvalue[i, 3] <- bd.test(x = sj1, y = sj2, num.permutations = R)$p.value
}
a1 <- mean(pvalue[, 1] < alpha)
a2 <- mean(pvalue[, 2] < alpha)
a3 <- mean(pvalue[, 3] < alpha)
print(c(a1, a2, a3))
```
9.3 Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy($\theta$, $\eta$) distribution has density function $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\qquad -\infty<x<\infty,\ \theta>0.$$
```r
set.seed(1122)
f <- function(x, theta, eta) {
  1 / (theta * pi * (1 + ((x - eta) / theta)^2))
}
m <- 5000
theta <- 1
eta <- 0
x <- numeric(m)
x[1] <- rnorm(1, 0, 1)
k <- 0
u <- runif(m)
for (i in 2:m) {
  xt <- x[i - 1]
  y <- rnorm(1, xt, 1)                      # normal proposal centred at xt
  l <- f(y, theta, eta) * dnorm(xt, y, 1)
  d <- f(xt, theta, eta) * dnorm(y, xt, 1)
  if (u[i] < l / d) {
    x[i] <- y
  } else {
    x[i] <- xt
    k <- k + 1                              # count rejections
  }
}
print(k)
b <- 1001                                   # discard the first 1000 values
y <- x[b:m]
a <- ppoints(20)
QR <- qcauchy(a)                            # theoretical quantiles
Q <- quantile(y, a)                         # sample quantiles after burn-in
par(mfrow = c(1, 2))
qqplot(QR, Q, main = "", xlab = "Cauchy Quantiles", ylab = "Sample Quantiles")
abline(0, 1)
hist(y, breaks = "scott", main = "", xlab = "", freq = FALSE)
lines(QR, f(QR, theta, eta))
```
9.8 This example appears in [40]. Consider the bivariate density $$f(x,y)\propto\binom{n}{x}y^{x+a-1}(1-y)^{n-x+b-1},\qquad x=0,1,\ldots,n,\ 0\le y\le 1.$$ It can be shown (see e.g. [23]) that for fixed a, b, n, the conditional distributions are Binomial(n, y) and Beta(x + a, n - x + b). Use the Gibbs sampler to generate a chain with target joint density f(x, y).
```r
N <- 500
burn <- 100
a <- 2
b <- 1
n <- 15
X <- matrix(0, N, 2)
X[1, ] <- c(0, 1)
for (i in 2:N) {
  x2 <- X[i - 1, 2]
  X[i, 1] <- rbinom(1, n, x2)               # x | y ~ Binomial(n, y)
  x1 <- X[i, 1]
  X[i, 2] <- rbeta(1, x1 + a, n - x1 + b)   # y | x ~ Beta(x + a, n - x + b)
}
istart <- burn + 1                          # discard the burn-in sample
x <- X[istart:N, ]
print(colMeans(x))
```
```r
Gelman.Rubin <- function(psi) {
  # psi[i, j]: statistic of chain i summarised up to iteration j
  psi <- as.matrix(psi)
  n <- ncol(psi)
  k <- nrow(psi)
  psi.means <- rowMeans(psi)
  B <- n * var(psi.means)        # between-chain variance
  psi.w <- apply(psi, 1, "var")
  W <- mean(psi.w)               # within-chain variance
  v.hat <- W * (n - 1) / n + (B / n)
  r.hat <- v.hat / W             # Gelman-Rubin statistic
  return(r.hat)
}
# the same Gibbs sampler as above
a <- 2
b <- 1
n <- 15
fxy <- function(x0, y0, N) {
  X <- matrix(0, N, 2)
  X[1, ] <- c(x0, y0)
  for (i in 2:N) {
    x2 <- X[i - 1, 2]
    X[i, 1] <- rbinom(1, n, x2)
    x1 <- X[i, 1]
    X[i, 2] <- rbeta(1, x1 + a, n - x1 + b)
  }
  return(X)
}
n.iter <- 1500     # chain length, kept separate from the binomial size n
k <- 3             # number of chains
x0 <- c(2, 4, 6)           # over-dispersed initial values for x
y0 <- c(0.25, 0.5, 0.75)   # initial y values must lie in (0, 1)
burn <- 100
mu1 <- matrix(0, k, n.iter)
mu2 <- matrix(0, k, n.iter)
for (i in 1:k) {
  h <- fxy(x0[i], y0[i], n.iter)
  mu1[i, ] <- h[, 1]
  mu2[i, ] <- h[, 2]
}
# running means of each chain, then the diagnostic for both components
idx <- matrix(1:n.iter, k, n.iter, byrow = TRUE)
psi1 <- t(apply(mu1, 1, cumsum)) / idx
psi2 <- t(apply(mu2, 1, cumsum)) / idx
print(Gelman.Rubin(psi1[, (burn + 1):n.iter]))
print(Gelman.Rubin(psi2[, (burn + 1):n.iter]))
```
(a)
```r
fk <- function(k, d = 2, x, y) {
  # k-th term of the series, with a = (x, y) and ||a||^2 = x^2 + y^2
  (-1)^k * (x^2 + y^2)^(k + 1) * gamma((d + 1) / 2) * gamma(k + 1.5) /
    factorial(k) / 2^k / (2 * k + 1) / (2 * k + 2) / gamma(k + d / 2 + 1)
}
```
(b)(c)
```r
# sum of the first 100 terms
sum1 <- 0
for (i in 0:99) {
  sum1 <- sum1 + fk(i, d = 2, 1, 2)
}
print(sum1)
# sum of the first 120 terms
sum2 <- 0
for (i in 0:119) {
  sum2 <- sum2 + fk(i, d = 2, 1, 2)
}
print(sum2)
```
The sums of the first 100 and the first 120 terms agree, indicating that the series has converged; for a = (1, 2) and d = 2 the sum is 1.5321.
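As a quick sanity check, added here rather than taken from the original, the magnitude of the k-th term decays very fast, so truncating after 100 terms is safe:

```r
# individual terms become negligible well before k = 100
print(abs(fk(20, d = 2, 1, 2)))
print(abs(fk(50, d = 2, 1, 2)))
```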
```r
S <- function(a, k) {
  ck <- sqrt(a^2 * k / (k + 1 - a^2))
  pt(ck, df = k, lower.tail = FALSE)
}
# solve S(a, k) - S(a, k - 1) = 0 for a
solve.k <- function(k) {
  output <- uniroot(function(a) S(a, k) - S(a, k - 1), lower = 0.1, upper = 2)
  output$root
}
for (k in c(100, 200, 300, 500, 1000, 5000)) print(solve.k(k))
```
As k increases, the root a stabilizes near 1.73, which agrees with the result obtained in Exercise 11.4.
```r
y <- c(0.21, 0.33, 0.43, 0.48, 0.54, 0.85, 0.91, 1, 1, 1)
EM <- function(y, max.it = 1000, eps = 1e-5) {
  i <- 1
  lambda1 <- 1
  lambda2 <- 0.5
  while (abs(lambda1 - lambda2) >= eps) {
    lambda1 <- lambda2
    lambda2 <- (sum(y) + 3 * lambda2) / 10   # EM update; 3 observations censored at 1
    if (i == max.it) break
    i <- i + 1
  }
  return(lambda2)
}
EM(y, max.it = 1000, eps = 1e-5)
```
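A short justification of the update above, added for clarity and assuming the intended model is exponential with mean $\lambda$ and the three values recorded as 1 are right-censored: by memorylessness, $E[Y \mid Y > 1] = 1 + \lambda$, so the EM iteration is
$$\lambda_{t+1}=\frac{\sum_{\mathrm{obs}} y_i + 3(1+\lambda_t)}{10}=\frac{\sum_{i=1}^{10} y_i + 3\lambda_t}{10},$$
since the three censored observations already contribute 1 each to $\sum_{i=1}^{10} y_i$. Its fixed point is $\lambda = \sum_{i=1}^{10} y_i / 7$, the censored-data MLE.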
```r
trims <- c(0, 0.1, 0.2, 0.5)
x <- rcauchy(100)
# based on the original code, but collapsing the list results into vectors
unlist(lapply(trims, function(trim) mean(x, trim = trim)))
unlist(lapply(trims, mean, x = x))
```
Because the second form just calls mean directly, with its x argument supplied through lapply's `...` and the next formal argument (trim) receiving the value of the current iteration element, the two forms produce identical results.
```r
# data = mtcars is passed explicitly, so attach(mtcars) is unnecessary
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
rsquared <- function(mod) summary(mod)$r.squared
# with a for loop
func_loop <- vector("list", length(formulas))
for (i in seq_along(formulas)) {
  func_loop[[i]] <- lm(formulas[[i]], data = mtcars)
}
unlist(lapply(func_loop, rsquared))
# with lapply
func_apply <- lapply(formulas, function(x) lm(formula = x, data = mtcars))
unlist(lapply(func_apply, rsquared))
```
Compute the standard deviation of every column in a numeric data frame.
```r
x <- data.frame(
  seq = 1:10,
  math = rnorm(10, 80, 3),
  Statistic = rnorm(10, 90, 1)
)
vapply(x, FUN = sd, FUN.VALUE = numeric(1))
```
Compute the standard deviation of every numeric column in a mixed data frame.
```r
y <- data.frame(
  seq = 1:3,
  name = c("张三", "李四", "王五"),
  math = rnorm(3, 80, 3),
  Statistic = rnorm(3, 90, 1)
)
vapply(y[vapply(y, FUN = is.numeric, FUN.VALUE = logical(1))],
       FUN = sd, FUN.VALUE = numeric(1))
```
```r
library(parallel)
mcsapply <- function(x, f, ...) {
  shuchu <- mclapply(x, f, ...)   # apply f to x in parallel
  simplify2array(shuchu)
}
mcvapply <- function(x, f, FUN.VALUE, ...) {
  shuchu_list <- mclapply(x, f, ...)
  out <- matrix(rep(FUN.VALUE, length(x)), nrow = length(x), byrow = TRUE)
  for (i in seq_along(x)) {
    res <- shuchu_list[[i]]
    stopifnot(
      length(res) == length(FUN.VALUE),   # check against the template shape
      typeof(res) == typeof(FUN.VALUE))   # and the template type
    out[i, ] <- res
  }
  out
}
```
An mcvapply can be constructed as well; see the code above.
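A quick usage sketch (the toy functions are illustrative only; note that mclapply falls back to serial execution on Windows):

```r
# Results should match sapply/vapply up to names and dimensions.
mcsapply(1:4, function(i) i^2)
mcvapply(1:4, function(i) c(sq = i^2, cube = i^3), FUN.VALUE = numeric(2))
```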
```r
library(microbenchmark)
library(Rcpp)
set.seed(5115)
# Gibbs sampler written in C++
sourceCpp(code = '
#include <Rcpp.h>
using namespace Rcpp;
//[[Rcpp::export]]
NumericMatrix method_1(int n, int a, int b, int N){
  NumericMatrix shuchu(N, 2);
  double x1 = 1;
  double x2 = 0.66;
  double canshu1 = 1;
  double canshu2 = 1;
  for (int i = 0; i < N; i++){
    x1 = rbinom(1, n, x2)[0];
    canshu1 = x1 + a;
    canshu2 = n - x1 + b;
    x2 = rbeta(1, canshu1, canshu2)[0];
    shuchu(i, 0) = x1;
    shuchu(i, 1) = x2;
  }
  return shuchu;
}
')
# the same Gibbs sampler written in R
method_2 <- function(n, a, b, N){
  X <- matrix(0, N, 2)
  X[1, ] <- c(0, 1)
  for (i in 2:N) {
    x2 <- X[i - 1, 2]
    X[i, 1] <- rbinom(1, n, x2)
    x1 <- X[i, 1]
    X[i, 2] <- rbeta(1, x1 + a, n - x1 + b)
  }
  X
}
# comparison
N <- 5000
burn <- 1000
a <- 2
b <- 4
n <- 25
istart <- burn + 1   # keep the burn-in index separate from the Beta parameter b
hanshu1 <- method_1(n, a, b, N)[istart:N, ]
hanshu2 <- method_2(n, a, b, N)[istart:N, ]
qqplot(hanshu1[, 2], hanshu2[, 2], xlab = "Rcpp", ylab = "R")
abline(0, 1, lwd = 2, col = "red")
# benchmark the function calls, not the already-computed matrices
COMP <- summary(microbenchmark(rcpp = method_1(n, a, b, N),
                               r = method_2(n, a, b, N)))
knitr::kable(COMP)
```