HW0 2021-09-16

Question

Use knitr to produce at least 3 examples(texts,figures,tables)

Answer

example1 生成10个服从U(0,1)的随机数

x = rnorm(10)
print(x)

example2 生成U(0,1),U(0,2)和U(0,0.5)的概率密度函数

x <- seq(-5,5,length.out=100)
plot(x,dnorm(x,0,0.5),type='l')
lines(x,dnorm(x,0,1))
lines(x,dnorm(x,0,2))

example3

num <- c(1:3)
name <- c('张三','李四','王五')
sex <- c('女','男','女')
x <- data.frame(num,name,sex);x

HW1 2021-09-23

3.4

n <- 500
u <- runif(n)
sigma <- runif(1, 0, 1000)
x <- sqrt(-2 * sigma^2 * log(1 - u))
hist(x, prob = TRUE, main=expression("f(x)==x/(sigma^2)*exp(-x^2/(2*(sigma^2)))"))
y <- seq(0, 10000, 0.01)
lines(y, y/sigma^2*exp(-y^2/(2*sigma^2)))

3.11 有双峰,当r=0.5左右应该最为明显

n <- 10000
X1 <- rnorm(n, 0, 1)
X2 <- rnorm(n, 3, 1)
r <- sample(c(0, 1), n, replace = TRUE)
Z <- r * X1 + (1 - r) * X2
hist(Z)
p1 <- 0.75
p2 <- 0.1
p3 <- 0.2
p4 <- 0.3
p5 <- 0.4
p6 <- 0.5
p7 <- 0.6
p8 <- 0.7
p9 <- 0.8
p10 <- 0.9
Z1 <- p1 * X1 + (1 - p1) * X2
Z2 <- p2 * X1 + (1 - p2) * X2
Z3 <- p3 * X1 + (1 - p3) * X2
Z4 <- p4 * X1 + (1 - p4) * X2
Z5 <- p5 * X1 + (1 - p5) * X2
Z6 <- p6 * X1 + (1 - p6) * X2
Z7 <- p7 * X1 + (1 - p7) * X2
Z8 <- p8 * X1 + (1 - p8) * X2
Z9 <- p9 * X1 + (1 - p9) * X2
Z10 <- p10 * X1 + (1 - p10) * X2
hist(Z1); hist(Z2); hist(Z3); hist(Z4); hist(Z5); hist(Z6); hist(Z7); hist(Z8); hist(Z9); hist(Z10)

3.20 不妨设泊松分布中拉姆达为2,伽马分布的参数为(1,2) 第一个思路: (经计算可得,x(10)的均值和方差均为1)

n <- 1000
p <- rpois(n, 2)
x <- rgamma(n, 1*p, 2)
# 以上为随机生成n个复合分布的数字

#下计算x(10)
q <- rpois(n, 10*2)
y <- rgamma(n, q*1, 2)
print(mean(x))
print(var(x))

第二个思路: 共生成n个数,对于每一个数,均为i(i服从泊松分布)个服从伽马分布的数相加得到(只是一些思路)

n <- 1000
y <- c()
for (j in (1:1000)){
    x <- rgamma(rpois(1,2),1,2)
    #print(j)
    y[j] <- sum(x)
    #print(sum(x))
}
print(mean(y))
print(var(y))

HW2 2021-09-30

Question

Exercises 5.4 5.9 5.13 and 5.14(Statistical Computating with R)

Answer

5.4

fx <- function(i){
  m <- 5000
  x <- runif(m,0,i)
  theta <- mean(30*x^2*(1-x)^2*i)
  print(c(i,theta,pbeta(i,3,3)))
}
for (j in seq(0.1,0.9,0.1)){
  fx(j)
}

通过结果可以看出,蒙特卡洛估计十分接近真实值

5.9

fx <- function(x,r=1000,antithetic=TRUE){
  u <- runif(r/2)
  if (! antithetic) v <- runif(r/2) else
    v <- 1-u
  u <- c(u,v)
  cdf <- numeric(length(x))
  for (i in 1:length(x)){
    g <- x[i]^2*u*exp(-(x[i]*u)^2/2)
    cdf[i] <- mean(g)
  }
  cdf
}

x <- seq(1,100,20)
set.seed(123)
fx1 <- fx(x,antithetic=FALSE)
set.seed(123)
fx2 <- fx(x,antithetic=TRUE)
print(round(rbind(x,fx1,fx2),5))

此题选取sigma为1,故g(x)在代码中直接体现带入sigma的方程式。fx1为真实值,fx2为模拟值,选择(1,21,41,61,81),可以看出模拟效果十分良好

5.13

n <- 5000
g <- function(x){
  x^2 * exp(-x^2/2) / sqrt(2*pi) * (x > 1)
}
hat <- se <- numeric(2)


x <- rexp(n, 1)+1
fg <- g(x) / exp(1-x)
hat[1] <- mean(fg)
se[1] <- sd(fg)


x <- rgamma(n,2,2)+1
fg <- g(x) / (4*(x-1)*exp(-2*(x-1)))
hat[2] <- mean(fg)
se[2] <- sd(fg)



rbind(hat, se)

第一个密度函数为exp(1-x) 第二个密度函数为gamma(2,2)向右平移一个单位 (平移原因是原题对f(x)的定义域有要求)

第一个密度函数的方差更小(经过多次运行可得)

5.14

n <- 5000
g <- function(x){
  x^2 * exp(-x^2/2) / sqrt(2*pi) * (x > 1)
}
x <- rexp(n, 1)+1
fg <- g(x) / exp(1-x)
hat <- mean(fg)
se <- sd(fg)
print(hat)
print(se)

所选取f(x)为参数为1的指数分布,可以看出,重要性抽样的模拟效果良好

HW3 2021-10-14

Question

Exercises 6.5 and 6.A(page 180-181)

Answer

6.5

#设置一些必要参数
UCL <- numeric(1000)
P <- numeric(2)
n <- 20
alpha <- 0.05

# 正态分布的部分
for (i in 1:1000){
  x <- rnorm(n, mean=0, sd=2)
  UCL[i] <- (n-1) * var(x) / qchisq(alpha, df=n-1)
}
P[1] <- mean(UCL > 4)

# 卡方分布的部分
for (i in 1:1000){
  x <- rchisq(n, df=2)
  UCL[i] <- (mean(x)-2)*sqrt(n)/sd(x)
}
P[2] <- mean(abs(UCL) < qt(1-alpha,n-1))

# 输出表格
d <- data.frame(P,row.names=c("norm","chisq"))
knitr::kable(d)

6.A

# 卡方(1)
n <- 20
alpha <- 0.05
mu0 <- 1

m <- 1000
p <- numeric(m)

for (j in 1:m){
  x <- rchisq(n, 1)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))


# U(0,2)
for (j in 1:m){
  x <- runif(n, 0, 2)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))

# 指数(1)
for (j in 1:m){
  x <- rexp(n, 1)
  ttest <- t.test(x, alternative = "two.sided", mu = mu0)
  p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat) / m)
print(c(p.hat, se.hat))

讨论题:

(1)原假设:p1=p2 备择假设:p1!=p2 (2)在大样本情况下,使用配对样本的t检验(paired-test)/z-test。因为两种方法的样本是同一个,不独立,所以不可以使用two-sample t-test。而McNemar test是针对独立性的,验证两者是否相关,在此处不适用。 (3)需要知道样本个数n(已知),p的实际值(已知),p-value(未知)。所以知道p-value即p的理论值即可。

HW4 2021-10-21

Question

Exercises 6.C(pages 182, Statistical Computating with R)

6.C Repeat Examples 6.8 and 6.10 for Mardia’s multivariate skewness test. Mardia proposed tests of multivariate normality based on multivariate generalizations of skewness and kurtosis. If X and Y are iid, the multivariate population skewness $\beta_{1,d}$is defined by Mardia as $$\beta_{1,d}=E[(X-\mu)^T\Sigma^{-1}(Y-\mu)]^3$$ Under normality, $\beta_{1,d}=0$. The multivariate skewness statistic is $$b_{1,d}=\frac{1}{n^2}\sum_{i,j=1}^n((X_i-\bar{X})^T(\hat{\Sigma})^{-1}(X_j-\bar{X}))^3$$ where $Sigma$ is the maximum likelihood estimator of covariance. Large values of $b_{1,d}$ are significant. The asymptotic distribution of $\frac{nb_{1,d}}{6}$ is chisquared with $\frac{d(d+1)(d+2)}{6}$ degrees of freedom.

Answer

This is a multivariate skewness test of 6.8.We generate variables following $N(\mu,\Sigma)$,where: $$\mu=(0,0)^T,\Sigma=\begin{pmatrix} 1 & 0 \ 0 & 1 \end{pmatrix}$$

library(MASS) # 调用此包用于产生多元正态分布的随机数
func <- function(data){
  n <- nrow(data) #数据的行数
  c <- ncol(data) #数据的列数
  central <- data #数据本身,以方便下做变换
  for (i in 1:c){
    central[,i] <- data[,i]-mean(data[,i]) #对原数据进行变体
  }
  sigma <- t(central)%*%central/n #计算统计量1/4
  a <- central%*%solve(sigma)%*%t(central) #计算统计量2/4
  b <- sum(colSums(a^3))/(n^2) #计算统计量3/4
  test <- n*b/6 #计算统计量4/4
  chi <- qchisq(0.95, c*(c+1)*(c+2)/6) #边界值
  as.integer(test>chi) #比较
}


mu <- c(0,0) #均值向量
sigma <- matrix(c(1,0,0,1),nrow=2,ncol=2) #方差矩阵
m <- 100 #运行次数
n <- c(10,20,30,50,100,200,300,400,500) #不同的样本量
a <- numeric(length(n)) #长度为9
for (i in 1:length(n)){
  a[i] = mean(replicate(m, expr={
    data <- mvrnorm(n[i],mu,sigma)
    func(data)
  }))
}
print(a)

From the result we can see that when n>50,the a value is close to 0.05.

Next we estimate by simulation the power of the skewness test of normality against a contaminated normal (normal scale mixture) alternative,The contaminated normal distribution is denoted by $(1-\epsilon)N(\mu_1,\Sigma_1)+\epsilon N(\mu_2,\Sigma_2)$,where: $$\mu_1=\mu_2=(0,0)^T,\Sigma_1=\begin{pmatrix} 1 & 0 \ 0 & 1 \end{pmatrix},\Sigma_2=\begin{pmatrix} 100 & 0 \ 0 & 100 \end{pmatrix}$$

mu1 <- mu2 <- c(0,0)
sigma1 <- matrix(c(1,0,0,1),nrow=2,ncol=2)
sigma2 <- matrix(c(100,0,0,100),nrow=2,ncol=2)
sigma <- list(sigma1,sigma2)
m = 100
n = 100
epsilon <- c(seq(0,0.1,0.01),seq(0.1,1,0.05))
N <- length(epsilon)
pwr <- numeric(N)
for (j in 1:N){
  e <- epsilon[j]
  sktests <- numeric(m)
  for (i in 1:m){
    index=sample(c(1,2),replace=TRUE,size=n,prob=c(1-e,e))
    data <- matrix(0,nrow=n,ncol=2)
    for (t in 1:n){
      if(index[t]==1) data[t,]=mvrnorm(1,mu1,sigma1)
      else data[t,]=mvrnorm(1,mu2,sigma2)
    }
    sktests[i] <- func(data)
  }
  pwr[j] <- mean(sktests)
}
plot(epsilon,pwr,type = 'b', xlab = bquote(epsilon), ylim = c(0,1))
abline(h = 0.01, lty = 3)
se <- sqrt(pwr * (1 - pwr) / m)
lines(epsilon, pwr+se, lty = 3)
lines(epsilon, pwr-se, lty = 3)

If $\epsilon$=0/1,this is similar with the first situation.When $0<=\epsilon<=1$, the pwr is greater than 0.05.

HW5 2021-10-28

Question

Exercises 7.7,7.8,7.9 and 7.B(pages 213, Statistical Computating with R)

Answer

7.7 Refer to Exercise 7.6. Efron and Tibshirani discuss the following example.The five-dimensional scores data have a 5*5 covariance matrix $\Sigma$, with positive eigenvalues $\lambda_1>\lambda_2>\lambda_3>\lambda_4>\lambda_5$, In principal components analysis,$$\theta=\frac{\lambda_1}{\sum_{j=1}^{5}\lambda_j}$$ measures the proportion of variance explained by the first principal compo- nent. Let $\hat\lambda_1>\hat\lambda_2>\hat\lambda_3>\hat\lambda_4>\hat\lambda_5$ be the eigenvalues of $\hat\Sigma$, where $\hat\Sigma$ is the MLE of $\Sigma$. Compute the sample estimate $$\hat\theta=\frac{\hat\lambda_1}{\sum_{j=1}^{n}\lambda_j}$$ ofθ. Use bootstrap to estimate the bias and standard error ofˆθ.

library(bootstrap)
n <- nrow(scor)
lambda_hat <- eigen(cov(scor))$values
theta_hat <- lambda_hat[1] / sum(lambda_hat)
B <- 1000
theta <- c()

for (b in 1:B){
  scor1 <- sample(scor, size = n, replace = TRUE)
  lambda <- eigen(cov(scor1))$values
  theta[b] <- lambda[1] / sum(lambda)
}
bias_boot <- mean(theta) - theta_hat
se_boot <- sd(theta)
print(round(c('bias_boot'=bias_boot,'se_boot'=se_boot),4))

7.8 Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat\theta$

library(bootstrap)
set.seed(123)
n <- nrow(scor)
lambda_hat <- eigen(cov(scor))$values
theta_hat <- lambda_hat[1] / sum(lambda_hat)
theta_j <- c()

for (i in 1:n){
  x <- scor[-i,]
  lambda <- eigen(cov(x))$values
  theta_j[i] <- lambda[1]/sum(lambda)

}
bias_jack <- (n-1)*(mean(theta_j) - theta_hat)
se_jack <- (n-1)*sqrt(var(theta_j)/n)

print(round(c('bias_jack'=bias_jack,'se_jack'=se_jack),4))

7.9 Refer to Exercise 7.7. Compute 95% percentile and BCa confidence intervals for $\hat\theta$

# 95% percentile confidence intervals
library(boot)
theta.boot <- function(dat,i){
  lambda <- eigen(cov(dat[i,]))$values
  theta <- lambda[1] / sum(lambda)
  theta
}
dat <- scor
boot.obj <- boot(dat, statistic = theta.boot, R = 1000)
print(boot.obj)
alpha <- c(0.025, 0.975)
print(quantile(boot.obj$t, alpha, type=6))
# 95% BCa confidence intervals
library(boot)
theta.boot <- function(dat,i){
  lambda <- eigen(cov(dat[i,]))$values
  theta <- lambda[1] / sum(lambda)
  theta
}
dat <- scor
boot.obj <- boot(dat, statistic = theta.boot, R = 1000)
print(boot.ci(boot.obj, type=c("bca","perc")))

percentile置信区间计算了两遍,两次结果相差不大

7.B Repeat Project 7.A for the sample skewness statistic. Compare the coverage rates for normal populations (skewness 0) andχ2(5) distributions (positive skewness)

library(bootstrap)
library(boot)
n <- 100

x <- rnorm(n)
boot1 <- function(x,i){
  xm <- mean(x[i])
  m3 <- mean((x - xm)^3)
  m2 <- mean((x - xm)^2)
  skewness <- m3 / m2^1.5
}

m <- 100
p1 <- c()
p2 <- c()
p3 <- c()

for (i in 1:m){
  x <- rnorm(n)
  boot.obj <- boot(x, statistic = boot1, R=1000)
  a <- boot.ci(boot.obj,conf=0.95,type=c('basic','norm','perc'))
  p1[i] <- (a$norm[2]<0)*(0<a$norm[3])
  p2[i] <- (a$perc[4]<0)*(0<a$perc[5])
  p3[i] <- (a$basic[4]<0)*(0<a$basic[5])
}
mean(p1)
mean(p2)
mean(p3)

y <- rchisq(n,5)
boot2 <- function(y,i){
  ym <- mean(y[i])
  m3 <- mean((y - ym)^3)
  m2 <- mean((y - ym)^2)
  skewness <- m3 / m2^1.5
}
skew <-sqrt(8/5)
r1 <- c()
r2 <- c()
r3 <- c()
for (i in 1:m){
  y <- rchisq(n,5)
  boot.obj <- boot(y, statistic = boot2, R=1000)
  b <- boot.ci(boot.obj,conf=0.95,type=c('basic','norm','perc'))
  r1[i] <- (b$norm[2]<skew)*(skew<b$norm[3])
  r2[i] <- (b$perc[4]<skew)*(skew<b$perc[5])
  r3[i] <- (b$basic[4]<skew)*(skew<b$basic[5])
}
mean(r1)
mean(r2)
mean(r3)

HW6 2021-11-04

Question 1

Exercise 8.2 (page 242, Statistical Computating with R).

8.2 Implement the bivariate Spearman rank correlation test for independence [255] as a permutation test. The Spearman rank correlation test statistic can be obtained from function cor with method = "spearman". Compare the achieved significance level of the permutation test with the p-value reported by cor.test on the same samples.

Answer 1

a <- c(12,54,13,45,32,2)
b <- c(1,78,28,46,69,46)
M <- 500
abc <- c(a, b)
gs <- 1:12
n<-length(a)
len <- numeric(M)
t0 <- (cor.test(a, b, method='spearman'))$estimate

for (i in 1:M) {
  gs <- sample(gs, size = n, replace = FALSE)
  a1 <- abc[gs]
  b1 <- abc[-gs]
  len[i] <- (cor.test(a1, b1,method='spearman'))$estimate
}

p <- mean(abs(c(t0, len)) >= abs(t0))
round(c(p,(cor.test(a, b, method='spearman'))$p.value),3)

Question 2

Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations. Unequal variances and equal expectations Unequal variances and unequal expectations Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions) Unbalanced samples (say, 1 case versus 10 controls) Note: The parameters should be chosen such that the powers are distinguishable (say, range from 0.3 to 0.8).

Answer 2.1

Unequal variances and equal expectations(Multiple normal distribution) distribution1: expectation:$\mu_1=(0,0)$ covariance:$$\Sigma_{1}=\left(\begin{array}{ccc} 1 & 0 \ 0 & 1 \end{array}\right)$$ distribution2: expectation:$\mu_2=(0,0)$ covariance:$$\Sigma_{2}=\left(\begin{array}{ccc} 3 & 0 \ 0 & 3 \end{array}\right)$$

library(RANN)
library(boot)
library(Ball)
library(energy)
library(MASS)

Function1 <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)

  x1 <- NN$nn.idx[1:n1,]
  x2 <- NN$nn.idx[(n1+1):n,]
  i1 <- sum(x1 < n1 + .5)
  i2 <- sum(x2 > n1+.5)
  (i1 + i2) / (k * n)
}

test.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z, statistic=Function1, R=R, sim = "permutation", sizes=sizes, k=k)
  vec <- c(boot.obj$t0,boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic=vec[1],p.value=p)
}

set.seed(1111)
u11 <- c(0,0)
sigma11 <- matrix(c(1,0,0,1),nrow=2,ncol=2)
u12 <- c(0,0)
sigma12 <- matrix(c(3,0,0,3),nrow=2,ncol=2)
n1 <- 28
n2 <- 28
n <- n1+n2 
N <- c(n1,n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA,m,3)

for(i in 1:m){
  sj1 <- mvrnorm(n1,u11,sigma11)
  sj2 <- mvrnorm(n2,u12,sigma12)
  sj <- rbind(sj1,sj2)
  pvalue[i,1] <- test.nn(sj,N,k)$p.value
  pvalue[i,2] <- eqdist.etest(sj,sizes=N,R=R)$p.value
  pvalue[i,3] <- bd.test(x=sj1,y=sj2,num.permutations=R)$p.value
}
a1 <- mean(pvalue[,1]<alpha)
a2 <- mean(pvalue[,2]<alpha)
a3 <- mean(pvalue[,3]<alpha)
print(c(a1,a2,a3))

Answer 2.2

Unequal variances and unequal expectations

distribution1: expectation:$\mu_1=(0,0)$ covariance:$$\Sigma_{1}=\left(\begin{array}{ccc} 1 & 0 \ 0 & 1 \end{array}\right)$$ distribution2: expectation:$\mu_2=(1,1)$ covariance:$$\Sigma_{2}=\left(\begin{array}{ccc} 4 & 0 \ 0 & 4 \end{array}\right)$$

library(RANN)
library(boot)
library(Ball)
library(energy)
library(MASS)

Function1 <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)

  x1 <- NN$nn.idx[1:n1,]
  x2 <- NN$nn.idx[(n1+1):n,]
  i1 <- sum(x1 < n1 + .5)
  i2 <- sum(x2 > n1+.5)
  (i1 + i2) / (k * n)
}

test.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z, statistic=Function1, R=R, sim = "permutation", sizes=sizes, k=k)
  vec <- c(boot.obj$t0,boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic=vec[1],p.value=p)
}

u21 <- c(0,0)
sigma21 <- matrix(c(1,0,0,1),nrow=2,ncol=2)
u22 <- c(1,1)
sigma22 <- matrix(c(4,0,0,4),nrow=2,ncol=2)
n1 <- 28
n2 <- 28
n <- n1+n2 
N <- c(n1,n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA,m,3)

for(i in 1:m){
  sj1 <- mvrnorm(n1,u21,sigma21)
  sj2 <- mvrnorm(n2,u22,sigma22)
  sj <- rbind(sj1,sj2)
  pvalue[i,1] <- test.nn(sj,N,k)$p.value
  pvalue[i,2] <- eqdist.etest(sj,sizes=N,R=R)$p.value
  pvalue[i,3] <- bd.test(x=sj1,y=sj2,num.permutations=R)$p.value
}
a1 <- mean(pvalue[,1]<alpha)
a2 <- mean(pvalue[,2]<alpha)
a3 <- mean(pvalue[,3]<alpha)
print(c(a1,a2,a3))

Answer 2.3

Non-normal distributions: t distribution with 1 df (heavy-tailed distribution)bimodel distribution (mixture of two normal distributions) $$t(1,3)$$ $$t(1,6)$$

library(RANN)
library(boot)
library(Ball)
library(energy)
library(MASS)

Function1 <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)

  x1 <- NN$nn.idx[1:n1,]
  x2 <- NN$nn.idx[(n1+1):n,]
  i1 <- sum(x1 < n1 + .5)
  i2 <- sum(x2 > n1+.5)
  (i1 + i2) / (k * n)
}

test.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z, statistic=Function1, R=R, sim = "permutation", sizes=sizes, k=k)
  vec <- c(boot.obj$t0,boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic=vec[1],p.value=p)
}


n1 <- 28
n2 <- 28
n <- n1+n2 
N <- c(n1,n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05

p_value <- matrix(NA,m,3)
for(i in 1:m){
  sj1 <- as.matrix(rt(n1,1,3),ncol=1)
  sj2 <- as.matrix(rt(n2,1,6),ncol=1)
  sj <- rbind(sj1,sj2)
  p_value[i,1] <- test.nn(sj,N,k)$p.value
  p_value[i,2] <- eqdist.etest(sj,sizes=N,R=R)$p.value
  p_value[i,3] <- bd.test(x=sj1,y=sj2,num.permutations=R)$p.value
}
a1 <- mean(p_value[,1]<alpha)
a2 <- mean(p_value[,2]<alpha)
a3 <- mean(p_value[,3]<alpha)
print(c(a1,a2,a3))

bimodel distribution (mixture of two normal distributions) x1 ~ distribution1: expectation:$\mu_1=(0,0)$ covariance:$$\Sigma_{1}=\left(\begin{array}{ccc} 1 & 0 \ 0 & 1 \end{array}\right)$$ x2 ~ distribution2: expectation:$\mu_2=(2,5)$ covariance:$$\Sigma_{2}=\left(\begin{array}{ccc} 8 & 0 \ 0 & 2 \end{array}\right)$$ $$sj_1 = 0.3x_1 + 0.7x_2$$ x3 ~ distribution1: expectation:$\mu_1=(1,4)$ covariance:$$\Sigma_{1}=\left(\begin{array}{ccc} 2 & 0 \ 0 & 3 \end{array}\right)$$ x4 ~ distribution2: expectation:$\mu_2=(3,7)$ covariance:$$\Sigma_{2}=\left(\begin{array}{ccc} 4 & 0 \ 0 & 6 \end{array}\right)$$ $$sj_2 = 0.4x_3 + 0.6x_4$$

library(RANN)
library(boot)
library(Ball)
library(energy)
library(MASS)

Function1 <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)

  x1 <- NN$nn.idx[1:n1,]
  x2 <- NN$nn.idx[(n1+1):n,]
  i1 <- sum(x1 < n1 + .5)
  i2 <- sum(x2 > n1+.5)
  (i1 + i2) / (k * n)
}

test.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z, statistic=Function1, R=R, sim = "permutation", sizes=sizes, k=k)
  vec <- c(boot.obj$t0,boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic=vec[1],p.value=p)
}

n41 <- 18
n42 <- 18
n43 <- 28
n44 <- 28
n1 <- n41
n2 <- n43
n <- n1+n2
N <- c(n1,n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA,m,3)

u31 <- c(0,0)
sigma31 <- matrix(c(1,0,0,1),nrow=2,ncol=2)
u32 <- c(2,5)
sigma32 <- matrix(c(8,0,0,2),nrow=2,ncol=2)
u33 <- c(1,4)
sigma33 <- matrix(c(2,0,0,3),nrow=2,ncol=2)
u34 <- c(3,7)
sigma34 <- matrix(c(4,0,0,6),nrow=2,ncol=2)

for(i in 1:m){
  x1 <- mvrnorm(n41,u31,sigma31)
  x2 <- mvrnorm(n42,u32,sigma32)
  x3 <- mvrnorm(n43,u33,sigma33)
  x4 <- mvrnorm(n44,u34,sigma34)
  sj1 <- 0.3*x1 + 0.7*x2
  sj2 <- 0.4*x3 + 0.6*x4
  sj <- rbind(sj1,sj2)
  pvalue[i,1] <- test.nn(sj,N,k)$p.value
  pvalue[i,2] <- eqdist.etest(sj,sizes=N,R=R)$p.value
  pvalue[i,3] <- bd.test(x=sj1,y=sj2,num.permutations=R)$p.value
}
a1 <- mean(pvalue[,1]<alpha)
a2 <- mean(pvalue[,2]<alpha)
a3 <- mean(pvalue[,3]<alpha)
print(c(a1,a2,a3))

Answer 2.4

Unbalanced samples (say, 1 case versus 10 controls) distribution1: expectation:$\mu_1=(0,0)$ covariance:$$\Sigma_{1}=\left(\begin{array}{ccc} 1 & 0 \ 0 & 1 \end{array}\right)$$ distribution2: expectation:$\mu_2=(0,0)$ covariance:$$\Sigma_{2}=\left(\begin{array}{ccc} 3 & 0 \ 0 & 3 \end{array}\right)$$ with $n_1=38,n_2=28$

library(RANN)
library(boot)
library(Ball)
library(energy)
library(MASS)

Function1 <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)

  x1 <- NN$nn.idx[1:n1,]
  x2 <- NN$nn.idx[(n1+1):n,]
  i1 <- sum(x1 < n1 + .5)
  i2 <- sum(x2 > n1+.5)
  (i1 + i2) / (k * n)
}

test.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z, statistic=Function1, R=R, sim = "permutation", sizes=sizes, k=k)
  vec <- c(boot.obj$t0,boot.obj$t)
  p <- mean(vec >= vec[1])
  list(statistic=vec[1],p.value=p)
}

u41 <- c(0,0)
sigma41 <- matrix(c(1,0,0,1),nrow=2,ncol=2)
u42 <- c(0,0)
sigma42 <- matrix(c(3,0,0,3),nrow=2,ncol=2)
n1 <- 38
n2 <- 28
n <- n1+n2 
N <- c(n1,n2)
k <- 2
R <- 99
m <- 88
alpha <- 0.05
pvalue <- matrix(NA,m,3)

for(i in 1:m){
  sj1 <- mvrnorm(n1,u41,sigma41)
  sj2 <- mvrnorm(n2,u42,sigma42)
  sj <- rbind(sj1,sj2)
  pvalue[i,1] <- test.nn(sj,N,k)$p.value
  pvalue[i,2] <- eqdist.etest(sj,sizes=N,R=R)$p.value
  pvalue[i,3] <- bd.test(x=sj1,y=sj2,num.permutations=R)$p.value
}
a1 <- mean(pvalue[,1]<alpha)
a2 <- mean(pvalue[,2]<alpha)
a3 <- mean(pvalue[,3]<alpha)
print(c(a1,a2,a3))

HW7 2021-11-11

Question 9.3

Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy(θ, η) distribution has density function. $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},-\infty0$$ The standard Cauchy has the Cauchy(θ = 1, η = 0) density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)

Answer 9.3

set.seed(1122)
f <- function(x,theta,eta){
  return (1 / (theta*pi*(1+((x-eta)/theta)^2)))
}

m <- 5000
theta <- 1
eta <- 0
x <- numeric(m)
x[1] <- rnorm(1,0,1)
k <- 0
u <- runif(m)

for (i in 2:m){
  xt <- x[i-1]
  y <- rnorm(1,xt,1)
  l <- f(y,theta,eta)*dnorm(xt,y,1)
  d <- f(xt,theta,eta)*dnorm(y,xt,1)
  if (u[i] < l/d) {
    x[i] <- y
    } else{
    x[i]<- xt
    k <- k+1
  }
}
print(k)

b <- 1001
y <- x[b:m]
a <- ppoints(20)
QR <- qcauchy(a)
Q <- quantile(x,a)

par(mfrow=c(1,2))
qqplot(QR,Q,main="",xlab="Cauchy Quantiles",ylab="Sample Quantiles")
abline(0,1)
hist(y,breaks="scott",main="",xlab="",freq=FALSE)
lines(QR,f(QR,theta,eta))

Question 9.8

This example appears in [40]. Consider the bivariate density

It can be shown (see e.g. [23]) that for fixed a, b, n, the conditional distributions are Binomial(n, y) and Beta(x + a, n − x + b). Use the Gibbs sampler to generate a chain with target joint density f(x, y).

Answer 9.8

N <- 500
burn <- 100
a <- 2
b <- 1
n <- 15

X <- matrix(0,N,2)
X[1,] <- c(0,1)

for (i in 2:N) {
  x2 <- X[i-1,2]
  X[i,1] <- rbinom(1, n, x2)
  x1 <- X[i,1]
  canshu1 <- x1+a
  canshu2 <- n-x1+b
  X[i,2] <- rbeta(1,canshu1,canshu2)
}

b <- burn+1
x <- X[b:N, ]
shuchu <- colMeans(x)
print(shuchu)

Answer 3-9.8

Gelman.Rubin <- function(psi) {
  psi <- as.matrix(psi)
  n <- ncol(psi)
  k <- nrow(psi)
  psi.means <- rowMeans(psi)
  B <- n * var(psi.means)
  psi.w <- apply(psi,1,"var")
  W <- mean(psi.w)
  v.hat <- W*(n-1)/n+(B/n)
  r.hat <- v.hat / W
  return(r.hat)
}

#关于原函数
a <- 2
b <- 1
n <- 15

fxy <- function(n1,n2,N){
  X <- matrix(0,N,2)
  X[,1] <- n1
  X[,2] <- n2
  for (i in 2:N) {
    x2 <- X[i-1,2]
    X[i,1] <- rbinom(1, n, x2)
    x1 <- X[i,1]
    canshu1 <- x1+a
    canshu2 <- n-x1+b
    X[i,2] <- rbeta(1,canshu1,canshu2)
  }
  return(X)
}

n <- 1500
k <- 3
n1 <- c(2,4,6)
n2 <- c(1,2,3)
b <- 100
mu1 <- matrix(0,k,n)
mu2 <- matrix(0,k,n)

for (i in 1:k){
  h <- fxy(n1[i],n2[i],n)
  mu1[i,] <- h[,1]
  mu2[i,] <- h[,2]
}

psimu1 <- t(apply(mu1,1,cumsum))
psimu2 <- t(apply(mu1,1,cumsum))

HW8 2021-11-18

Question 11.3 11.5 3

Answer 11.3

(a)

fk <- function(k,d=2,x,y){
  return ((-1)^k * (x^2+y^2)^(k+1) * gamma((d+1)/2) * gamma(k + 1.5) / factorial(k) / 2^k / (2 * k + 1) / (2 * k+2) / gamma(k+d/2+1))
}

(b)(c)

sum1 = 0
for (i in 0:99){
  sum1 = sum1 + fk(i,d=2, 1, 2)
}
print(sum1)

sum2 = 0
for (i in 0:119){
  sum2 = sum2 + fk(i,d=2, 1, 2)
}
print(sum2)

前100项与前120项所得相同,说明其和已经收敛,故a=(1,2),d=2时,和为1.5321

Answer 11.5

S <- function(a,k){
  ck <- sqrt(a^2*k/(k+1-a^2))
  pt(ck,df=k,lower.tail=FALSE)
}
f = function(a,k){S(a,k)-S(a,k-1)}

solve = function(k){
  output = uniroot(function(a){S(a,k)-S(a,k-1)},lower=0.1,upper=2)
  output$root
}
solve(100)
solve(200)
solve(300)
solve(500)
solve(1000)
solve(5000)

随着k的增大,a的值稳定在1.73附近,与11.4所得相同

Answer 3

y <- c(0.21, 0.33, 0.43, 0.48, 0.54, 0.85, 0.91, 1, 1, 1)
EM <- function(y, max.it=1000, eps=1e-5){
  lambda = 0.5
  i <- 1
  lambda1 = 1
  lambda2 = 0.5
  while (abs(lambda1-lambda2) >= eps){
    lambda1 <- lambda2
    lambda2 <- (sum(y) + 3 * lambda2) / 10
    if (i == max.it) break
    i <- i + 1
  }
  return(lambda2)
}
EM(y,max.it=1000,eps=1e-5)

HW9 2021-11-25

page 204-1

trims <- c(0, 0.1, 0.2, 0.5)
x <- rcauchy(100)
# 在源代码的基础上以向量矩阵输出
unlist(lapply(trims, function(trim) mean(x, trim = trim)))
unlist(lapply(trims, mean, x = x))

Because the second form just calls mean directly with the value of its x argument assigned by lapply and the next argument (which is trim receiving the value of the current iteration element.

page 204-5

attach(mtcars)

formulas = list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
rsquared <- function(mod) summary(mod)$r.squared

#loops
func_loop = vector("list", length(formulas))
for (i in seq_along(formulas)){
  func_loop[[i]] = lm(formulas[[i]], data = mtcars)
}
unlist(lapply(func_loop,rsquared))

# lapply
func_apply <- lapply(formulas, function(x) lm(formula = x, data = mtcars))
unlist(lapply(func_apply,rsquared))

page 214-1

计算数字数据框中每列的标准差

x <- data.frame(
  seq = c(1:10),
  math = rnorm(10,80,3),
  Statistic = rnorm(10,90,1)
)
vapply(x, FUN = sd,FUN.VALUE = 0)

计算混合数据框中每列的标准差

y <- data.frame(
  seq = c(1:3),
  name = c('张三','李四','王五'),
  math = rnorm(3,80,3),
  Statistic = rnorm(3,90,1)
)
vapply(y[vapply(y, FUN = is.numeric, FUN.VALUE = logical(1))], FUN = sd, FUN.VALUE = numeric(1))

page 214-7

mcsapply <- function(x, f, ...) {
  shuchu <- mclapply(f, ...)
  simplify2array(shuchu)
}

mcvapply <- function(x, f, FUN.VALUE, ...) {
  shuchu_list <- mclapply(f, ...)
  out <- matrix(rep(FUN.VALUE, length(x)), nrow = length(x))
  for (i in seq_along(x)) {
    stopifnot(
      length(res) == length(f.value),
      typeof(res) == typeof(f.value))
    out[i, ] <- shuchu_list[[i]]
  }
  out
}

mcvapply也是可以生成的,构造如上。

HW10 2021-12-02

Answer

library(microbenchmark)

# 用C++写
library(Rcpp)
set.seed(5115)

sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;

//[[Rcpp::export]]
  NumericMatrix method_1(int n, int a, int b, int N){
    NumericMatrix shuchu(N, 2);
    double x1=1;
    double x2=0.66;
    double canshu1=1;
    double canshu2=1;
    for(int i=0; i < N; i++){
      x1 = rbinom(1, n, x2)[0];
      canshu1 = x1 + a;
      canshu2 = n - x1 + b;
      x2 = rbeta(1, canshu1, canshu2)[0];
      shuchu(i,0) = x1;
      shuchu(i,1) = x2;
    }
    return(shuchu);
  }
')

# 用R写
method_2 <- function(n, a, b, N){
  X <- matrix(0,N,2)
  X[1,] <- c(0,1)
  for (i in 2:N) {
    x2 <- X[i-1,2]
    X[i,1] <- rbinom(1, n, x2)
    x1 <- X[i,1]
    canshu1 <- x1+a
    canshu2 <- n-x1+b
    X[i,2] <- rbeta(1,canshu1,canshu2)
  }
  return(X)
}

#比较
N <- 5000
burn <- 1000
a <- 2
b <- 4
n <- 25
b <- burn+1

hanshu1 <- method_1(n,a,b,N)[b:N,]
hanshu2 <- method_2(n,a,b,N)[b:N,]

qqplot(hanshu1[,2], hanshu2[,2],xlab='r',ylab='c++')
abline(0, 1, lwd=2, col='red')

COMP = summary(microbenchmark(hanshu1,hanshu2))
knitr::kable(COMP)


MenggeYang/StatComp21032 documentation built on Dec. 23, 2021, 11:18 p.m.