Homework 1

Question 1

Generating a series of normal random numbers.

Answer

a=rnorm(10)
a

Question 2

Generate a matrix.

Answer

matrix(1:6, 2, 3)

Question 3

Draw a scatter plot.

Answer

x=rnorm(10)
y=rnorm(10)
plot(x,y)

Homework 2

3.5

Problem
A discrete random variable X has probability mass function
x 0 1 2 3 4
p(x) 0.1 0.2 0.2 0.2 0.3
Use the inverse transform method to generate a random sample of size 1000 from the distribution of X. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.

Idea
Generate $U \sim U(0, 1)$.
If $F_X(x_{i-1}) < U \leq F_X(x_i)$, then set $X = x_i$.

Result

x <- 0:4; p <- c(0.1,0.2,0.2,0.2,0.3)
cp <- cumsum(p); m <- 1e3
r <- x[findInterval(runif(m), cp) + 1]  # inverse transform via the cdf intervals
ct <- as.vector(table(r))
rbind(empirical = ct/sum(ct), theoretical = p)  # relative frequencies vs. theoretical probabilities
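
The exercise also asks to repeat the simulation with the R sample function; a minimal sketch, reusing the same x, p and m as above:

r2 <- sample(x, size = m, replace = TRUE, prob = p)  # same pmf, generated with sample()
ct2 <- as.vector(table(r2))
rbind(empirical = ct2/sum(ct2), theoretical = p)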

3.7:

Problem Write a function to generate a random sample of size n from the Beta(a, b) distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the Beta(3,2) distribution. Graph the histogram of the sample with the theoretical Beta(3,2) density superimposed.

Idea Generate random numbers $U \sim U(0, 1)$ and $Y \sim g(\cdot)$.
If $U \leq \frac{f(Y)}{c\,g(Y)}$, then accept Y and stop (return X = Y);
otherwise reject Y and continue.
In this problem, we choose $g(x) = 1$, $0 < x < 1$, and $c = \frac{48}{27}$.
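(A quick check of this bound: the Beta(3,2) density is $f(x) = 12x^2(1-x)$, maximized at $x = 2/3$, so $c = \max_{0<x<1} f(x)/g(x) = 12 \cdot \frac{4}{9} \cdot \frac{1}{3} = \frac{48}{27} = \frac{16}{9}$, and the acceptance condition $u \leq f(Y)/(c\,g(Y)) = \frac{27}{4}Y^2(1-Y)$ is exactly the test used in the code below.)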

Result

n <- 1e3; j <- k <- 0; y <- numeric(n)  # j counts candidates, k counts accepted values
while (k < n) {
  u <- runif(1)
  j <- j + 1
  x <- runif(1)  # random variate from g
  if (27/4 * x^2 * (1-x) > u) {
    # we accept x
    k <- k + 1
    y[k] <- x
  }
}
hist(y, breaks = 20, freq = FALSE)
f <- function(x) 12 * x^2 * (1-x)  # Beta(3,2) density
curve(f, add = TRUE)
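
As an optional sanity check (not required by the exercise), the average number of candidates needed per accepted value should be close to $c = 16/9 \approx 1.78$:

j / n  # observed candidates per acceptance; compare with c = 16/9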

3.12:

Problem
Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has a $Gamma(r, \beta)$ distribution and $Y$ has an $Exp(\Lambda)$ distribution. That is, $(Y \mid \Lambda = \lambda) \sim f_Y(y \mid \lambda) = \lambda e^{-\lambda y}$. Generate 1000 random observations from this mixture with $r = 4$ and $\beta = 2$.

Idea:
First generate 1000 random $\lambda$'s from the $Gamma(r, \beta)$ distribution, then generate the $Y$'s from the corresponding $Exp(\lambda)$ distributions.

Result

n <- 1e3; r <- 4; beta <- 2                  # r = 4, beta = 2 as given in the problem
lambda <- rgamma(n, shape = r, rate = beta)  # Lambda ~ Gamma(r, beta)
y <- rexp(n, lambda)                         # Y | Lambda = lambda ~ Exp(lambda)
hist(y,breaks=20,freq=F)
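
As an optional check (a standard mixture result, not stated in the exercise), if $\Lambda \sim Gamma(r, \text{rate}=\beta)$ the marginal density of $Y$ is the Pareto density $f(y) = r\beta^r/(\beta+y)^{r+1}$, which can be superimposed on the histogram:

curve(r * beta^r / (beta + x)^(r + 1), from = 0, to = max(y), add = TRUE, col = 2)  # theoretical marginal density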

Homework 3

5.4

Problem statement Write a function to compute a Monte Carlo estimate of the Beta(3, 3) cdf, and use the function to estimate F(x) for x = 0.1, 0.2, . . ., 0.9. Compare the estimates with the values returned by the pbeta function in R.

Idea If $a$ and $b$ are finite, then $$\int_a^b h(x)dx = \int_a^b (b-a)h(x)\frac1{b-a}dx=E[g(X)],$$ where $X\sim U(a,b)$ and $g(x)=(b-a)h(x)$.
Approximate the expectation by the sample mean (Strong Law of Large Numbers): $$\frac1m \sum_{i=1}^m g(X_i),$$ where $X_1,\ldots,X_m$ are iid copies of $X$ and $m$ is a sufficiently large integer.

Result

Pbeta <- function(y){
  if (y <= 0) value <- 0
  if (y >= 1) value <- 1
  if (y > 0 & y < 1){
    m <- 1e4; x <- runif(m, min = 0, max = y)     # X ~ U(0, y)
    value <- mean(30 * x^2 * (1-x)^2) * (y - 0)   # (y - 0) times the mean of the Beta(3,3) density
  }
  value
}
sapply((1:9)/10, Pbeta)                           # Monte Carlo estimates of F(x)
sapply((1:9)/10, pbeta, shape1 = 3, shape2 = 3)   # exact values from pbeta

5.9:

Problem statement The Rayleigh density is $$f(x) = \frac{x}{\sigma^2} e^{-x^2/(2\sigma^2)}, \qquad x \geq 0,\ \sigma > 0.$$
Implement a function to generate samples from a Rayleigh($\sigma$) distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1, X_2$?
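Idea: If $U \sim U(0,1)$ then $1-U$ has the same distribution but is negatively correlated with $U$, so estimators built from $U$ and $1-U$ satisfy
$$Var\left(\frac{\hat\theta + \hat\theta'}{2}\right) = \frac{Var(\hat\theta) + Cov(\hat\theta, \hat\theta')}{2} < \frac{Var(\hat\theta)}{2} = Var\left(\frac{\hat\theta_1 + \hat\theta_2}{2}\right)$$
whenever $Cov(\hat\theta, \hat\theta') < 0$, which is the variance reduction measured below.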

Result

MC.Phi <- function(x, sigma, R = 10000, antithetic = TRUE) {
  u <- runif(R/2)
  if (!antithetic)
    v <- runif(R/2)   # second independent half
  else
    v <- 1 - u        # antithetic pair of the first half
  w <- c(u, v)
  cdf <- numeric(length(x))
  for (i in 1:length(x)) {
    # integrand of the Rayleigh cdf at t = x[i]*w, times the interval length x[i]
    g <- x[i] * w / sigma^2 * exp(-(x[i] * w)^2 / (2 * sigma^2)) * x[i]
    cdf[i] <- mean(g)
  }
  return(cdf)
}
x <- seq(.1,2.5,length=5);
pRayleigh <- function(x, sigma){   # exact Rayleigh cdf by numerical integration
  s <- sigma
  p <- numeric(length(x))
  integrand <- function(x) x/(s^2) * exp(-x^2/(2*s^2))
  for (i in 1:length(x)){
    p[i] <- integrate(integrand, 0, x[i])$value
  }
  return(p)
}
Phi<-pRayleigh(x,sigma=2)
set.seed(123)
MC1<- MC.Phi(x,sigma=2,anti=FALSE) #for (X1+X2)/2 which X1,X2 is independent
set.seed(123)
MC2<- MC.Phi(x,sigma=2,anti=TRUE)  #for antithetic variables (X+X')/2
print(round(rbind(x, MC1, MC2, Phi),5))
m <- 1000
MC1 <- MC2 <- numeric(m)
x <- 1.95
for (i in 1:m) {
MC1[i] <- MC.Phi(x,2,R = 1000, anti = FALSE)
MC2[i] <- MC.Phi(x,2,R = 1000,anti=TRUE)
}
print(sd(MC1))
print(sd(MC2))
print((var(MC1) - var(MC2))/var(MC1))

The antithetic variable approach achieved approximately a 91.9% reduction in variance at $x = 1.95$ and $\sigma = 2$.

5.13:

Problem statement
Find two importance functions $f_1$ and $f_2$ that are supported on $(1,\infty)$ and are "close" to $$g(x) = \frac{x^2}{\sqrt{2 \pi}}e^{-x^2/2}, x > 1.$$ Which of your two importance functions should produce the smaller variance in estimating $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2 \pi}}e^{-x^2/2}dx$$ by importance sampling? Explain.
Idea:
$\theta=\int g(x)dx$ can be written as $$\int \frac{g(x)}{f(x)}f(x)dx=E(g(X)/f(X)),$$ where $X$ has pdf/pmf $f(\cdot)$.
$f(\cdot)$ is a pdf/pmf from which random samples are easy to generate; it is called the importance function for $g(\cdot)$.
$\theta$ can be estimated with $$\hat\theta=\frac1m\sum_{i=1}^m\frac{g(X_i)}{f(X_i)}$$
The variance of $\hat\theta$ is $var(g(X_1)/f(X_1))/m$, which has the minimal value 0 when $g(\cdot) = c f(\cdot)$ for some constant $c$.
We select $f_1=e^{-x+1}, x>1$ and $f_2=\frac{1}{2}{(x-1)}^2e^{-x+1}, x>1$.
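Both are proper densities on $(1,\infty)$: $f_1$ is an $Exp(1)$ density shifted by 1 and $f_2$ is a $Gamma(3,1)$ density shifted by 1, which is how they are sampled in the code below (rexp(m, 1) + 1 and rgamma(m, 3, 1) + 1).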

Result

x <- seq(1,5,length.out = 20)
    g <- exp(-x^2/2)*x^2/sqrt(2*pi)
    f1 <- exp(-x+1)
    f2 <- 1/2*(x-1)^2 *exp(-x+1)
#figure (a)
plot(g~x,type = "l",col=1)
lines(f1~x,col=2)
lines(f2~x,col=3)
legend("topright", legend =c("g", "f1", "f2"),
           lty = 1:3, lwd = 2, inset = 0.02,col=1:3)
#figure (b)
plot(g/f1~x,type = "l",col=1)
lines(g/f2~x,col=2)
legend("topright", legend =c("f1", "f2"),
           lty = 1:2, lwd = 2, inset = 0.02,col=1:2)
m <- 10000
  theta.hat <- se <- numeric(2)
  g <- function(x) {
  exp(-x^2/2)*x^2/sqrt(2*pi) * (x > 1)
  }
x <- rexp(m, rate= 1)+1 #using f1
  fg <- g(x)/exp(-x+1)
  theta.hat[1] <- mean(fg)
  se[1] <- sd(fg)
x <- rgamma(m, shape=3, rate = 1)+1 #using f2
  fg <- g(x)/(1/2*(x-1)^2 *exp(-x+1))
  theta.hat[2] <- mean(fg)
  se[2] <- sd(fg)
  res <- rbind(theta=round(theta.hat,3), se=round(se,3))
  colnames(res) <- paste0('f',1:2)
  knitr::kable(res,align='c')

From the figure and the table, $f_1$ produces the smaller variance: the ratio $g/f_1$ is much closer to a constant over $(1,\infty)$, while $g/f_2$ blows up near $x = 1$, so $f_1$ is the better match to $g$.

5.14:

Problem statement
Obtain a Monte Carlo estimate of $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2 \pi}}e^{-x^2/2}dx$$ by importance sampling.
Idea:
$\theta=\int g(x)dx$ can be written as $$\int \frac{g(x)}{f(x)}f(x)dx=E(g(X)/f(X)),$$ where $X$ has pdf/pmf $f(\cdot)$.
$f(\cdot)$ is a pdf/pmf from which random samples are easy to generate; it is called the importance function for $g(\cdot)$.
$\theta$ can be estimated with $$\hat\theta=\frac1m\sum_{i=1}^m\frac{g(X_i)}{f(X_i)}$$
The variance of $\hat\theta$ is $var(g(X_1)/f(X_1))/m$, which has the minimal value 0 when $g(\cdot) = c f(\cdot)$ for some constant $c$.
We select $f=e^{-x+1}, x>1$.

Result

m <- 10000
  theta.hat <- se <- numeric(1)
  g <- function(x) {
  exp(-x^2/2)*x^2/sqrt(2*pi) * (x > 1)
  }
x <- rexp(m, rate= 1)+1 #using f
  fg <- g(x)/exp(-x+1)
  theta.hat[1] <- mean(fg)
  se[1] <- sd(fg)
  res <- rbind(theta=round(theta.hat,3), se=round(se,3))
  colnames(res) <- 'f'
  knitr::kable(res,align='c')

Homework 4

6.9

Problem statement Let X be a non-negative random variable with $\mu = E[X] < \infty$. For a random sample $x_1,\ldots,x_n$ from the distribution of X, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum_{j=1}^{n}\sum_{i=1}^{n}|x_i-x_j|.$$ The Gini ratio is applied in economics to measure inequality in income distribution. Note that G can be written in terms of the order statistics $x_{(i)}$ as $$G=\frac{1}{n^2\mu}\sum_{i=1}^{n}(2i-n-1)x_{(i)}.$$ If the mean is unknown, let $\hat{G}$ be the statistic G with $\mu$ replaced by $\bar{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if X is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.

Result

m <- 1000 # Number of Monte Carlo trials
n <- 100
set.seed(1)
G.hat1 <- numeric(m) # Storage for test statistics from the MC trials
g1 <- numeric(n)
Id <- 1:n
## Start the simulation
for (i in 1:m){
  x1 <- rlnorm(n) # x1 generated from standard lognormal distribution
  mu_hat1 <- mean(x1) # The estimation of mu
  x1_order <- sort(x1) # x1_order is the order statistic of x1
  G.hat1[i] <- (2 * Id -n-1)%*%x1_order/(n^2 *mu_hat1)# Estimate the value of G
}
print(c(mean(G.hat1),median(G.hat1))) # the mean and median of G.hat1
print(quantile(G.hat1,probs=seq(0.1,1,0.1))) # the deciles of G.hat1
hist(G.hat1,prob = TRUE)

m <- 1000 # Number of Monte Carlo trials
n <- 100
set.seed(12)
G.hat2 <- numeric(m) # Storage for test statistics from the MC trials
g2 <- numeric(n)
Id <- 1:n
## Start the simulation
for (i in 1:m){
  x2 <- runif(n) # x2 generated from uniform distribution
  mu_hat2 <- mean(x2) # The estimation of mu
  x2_order <- sort(x2) # x2_order is the order statistic of x2
  G.hat2[i] <- (2 * Id -n-1)%*%x2_order/(n^2 *mu_hat2)# Estimate the value of G
}
print(c(mean(G.hat2),median(G.hat2))) # the mean and median of G.hat2
print(quantile(G.hat2,probs=seq(0.1,1,0.1))) # the deciles of G.hat2
hist(G.hat2,prob = TRUE)

m <- 1000 # Number of Monte Carlo trials
n <- 100
set.seed(123)
G.hat3 <- numeric(m) # Storage for test statistics from the MC trials
g3 <- numeric(n)
Id <- 1:n
## Start the simulation
for (i in 1:m){
  x3 <- rbinom(n,1,0.1) # x3 generated from Bernoulli(0.1) distribution
  mu_hat3 <- mean(x3) # The estimation of mu
  x3_order <- sort(x3) # x3_order is the order statistic of x3
  G.hat3[i] <- (2 * Id -n-1)%*%x3_order/(n^2 *mu_hat3)# Estimate the value of G
}
print(c(mean(G.hat3),median(G.hat3))) # the mean and median of G.hat3
print(quantile(G.hat3,probs=seq(0.1,1,0.1))) # the deciles of G.hat3
hist(G.hat3,prob = TRUE)

6.10:

Problem statement Construct an approximate 95% confidence interval for the Gini ratio $\gamma = E[G]$ if X is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

Result

# function to calculate the confidence interval
compu.interval <- function(a,b){
m<-1e3
G<-numeric(m)
I<-2*c(1:m)-m-1
set.seed(123)
for(i in 1:m){
  x<-rlnorm(m,a,b) #generate random numbers
  x<-sort(x) #sorting x
  mu=mean(x)
  G[i]<-1/m^2/mu*(t(I)%*%x) #compute G
}
CI<-c(mean(G)-1.96*sd(G)/sqrt(m),mean(G)+1.96*sd(G)/sqrt(m))#compute confidence interval
return(CI)
}
#approximate Coverage probability(ECP) of confidence interval
N<-100
bar<-numeric(N)
k<-0
a <- 0
b <- 1
m<-1e3 # sample size / number of replicates, matching compu.interval
I<-2*c(1:m)-m-1
G.true<-numeric(m)
CI <- compu.interval(a,b)
set.seed(1234)
for(j in 1:N){
  for(i in 1:m){
    x<-rlnorm(m,0,1) 
    x<-sort(x) 
    mu<-mean(x)
    G.true[i]<-1/m^2/mu*(t(I)%*%x)
  }
  bar[j]<-mean(G.true)
  if(bar[j]>CI[1]&bar[j]<CI[2]){
    k<-k+1}
}
k/N

Homework 5

7.1

Problem statement
Exercise 7.1: Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.
LSAT: 576 635 558 578 666 580 555 661 651 605 653 575 545 572 594
GPA: 339 330 281 303 344 307 300 343 336 313 312 274 276 288 296

Result

set.seed(12345)
LSAT=c(576,635,558,578,666,580,555,661,651,605,653,575,545,572,594)
GPA=c(339,330,281,303,344,307,300,343,336,313,312,274,276,288,296)
x=cbind(LSAT,GPA)
n=15
b.cor <- function(x,i) cor(x[i,1],x[i,2])
theta.hat <- b.cor(x,1:n)
theta.jack <- numeric(n)
for(i in 1:n){
theta.jack[i] <- b.cor(x,(1:n)[-i])
}
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))
bias.jack
se.jack

7.5:

Problem statement
Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $1/\lambda$ by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ. The 12 observations are the times in hours between failures of air-conditioning equipment: 3, 5, 7, 18, 43, 85, 91, 98, 100, 130, 230, 487.

Result

set.seed(12345)
library(boot)
x=c(3,5,7,18,43,85,91,98,100,130,230,487)
boot.mean=function(x,i) mean(x[i])
de=boot(data=x,statistic=boot.mean,R=1024)
ci=boot.ci(de,type=c("norm","basic","perc","bca"))
ci

7.8:

Problem statement
Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat \theta$.

Result

library(bootstrap)
data <- scor
n <- nrow(data)   # 88 observations
theta <- function(dat){
  m <- cov(dat) * (nrow(dat)-1) / nrow(dat)  ## MLE of the covariance matrix
  lambda <- eigen(m)$values                  ## the eigenvalues
  lambda[1] / sum(lambda)                    ## proportion of variance of the first principal component
}
theta.hat <- theta(data)
theta.jack <- numeric(n)
for (i in 1:n) theta.jack[i] <- theta(data[-i, ])  # leave one observation out
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))
bias.jack
se.jack

7.11:

Problem statement In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.
Result

##leave-one-out (n-fold) cross validation
library(DAAG); attach(ironslag)
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- numeric(n)
# for n-fold cross validation
# fit models on leave-one-out samples
for (k in 1:n) {
y <- magnetic[-k]
x <- chemical[-k]
J1 <- lm(y ~ x)
yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
e1[k] <- magnetic[k] - yhat1
J2 <- lm(y ~ x + I(x^2))
yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] +
J2$coef[3] * chemical[k]^2
e2[k] <- magnetic[k] - yhat2
J3 <- lm(log(y) ~ x)
logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
yhat3 <- exp(logyhat3)
e3[k] <- magnetic[k] - yhat3
J4 <- lm(log(y) ~ log(x))
logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k])
yhat4 <- exp(logyhat4)
e4[k] <- magnetic[k] - yhat4
}
c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
##According to the prediction error criterion, Model 2, the quadratic model,
##would be the best fit for the data.
##leave-two-out cross validation
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- numeric(choose(n,2))  # one error summary per pair left out
E1 <- E2 <- E3 <- E4 <- rep(0,2)
subscript<-t(combn(n,2))
for (k in 1:choose(n,2)) {
K<-subscript[k,]
y <- magnetic[-K]
x <- chemical[-K]
J1 <- lm(y ~ x)
yhat1 <- J1$coef[1] + J1$coef[2] * chemical[K]
E1<- magnetic[K] - yhat1
e1[k]<-sum(abs(E1))
J2 <- lm(y ~ x + I(x^2))
yhat2 <- J2$coef[1] + J2$coef[2] * chemical[K] +J2$coef[3] * chemical[K]^2
E2 <- magnetic[K] - yhat2
e2[k]<-sum(abs(E2))
J3 <- lm(log(y) ~ x)
logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[K]
yhat3 <- exp(logyhat3)
E3<- magnetic[K] - yhat3
e3[k]<-sum(abs(E3))
J4 <- lm(log(y) ~ log(x))
logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[K])
yhat4 <- exp(logyhat4)
E4<- magnetic[K] - yhat4
e4[k]<-sum(abs(E4))
}
c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
##According to the prediction error criterion, Model 2, the quadratic model,
##would be the best fit for the data.

Homework 6

library(latticeExtra)
library(RANN)
library(energy)
library(Ball)
library(boot)
library(ggplot2)

8.1

Problem statement Implement the two-sample Cramer-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.

Result

##function: two-sample Cramer-von Mises test for equal distributions (permutation test)
cvm <- function(x,y,data){
  r <- 1000                  # number of permutation replicates
  reps <- numeric(r)
  n <- length(x)
  m <- length(y)
  z <- c(x,y)
  N <- length(z)
  Ix <- 1:n
  Iy <- 1:m
  # Anderson's computing formula: uses the ranks of the two samples in the
  # pooled sample, not the raw observations
  cvm.stat <- function(x1, y1){
    rk <- rank(c(x1, y1))
    rx <- sort(rk[1:n])      # ranks of the x-sample, in increasing order
    ry <- sort(rk[(n+1):N])  # ranks of the y-sample, in increasing order
    U <- n * sum((rx - Ix)^2) + m * sum((ry - Iy)^2)
    U / (m * n * N) - (4 * m * n - 1) / (6 * N)
  }
  reps_0 <- cvm.stat(x, y)   # observed test statistic
  for (k in 1:r){            # permutation replicates
    w <- sample(N, size = n, replace = FALSE)
    reps[k] <- cvm.stat(z[w], z[-w])
  }
  p <- mean(c(reps_0,reps) >= reps_0)
  return(
    histogram(c(reps_0,reps),
              type="density",
              col="#0080ff",
              xlab="Replicates of Cramer-Von Mises statistic",
              ylab=list(rot=0),
              main=paste0("Data:",data),
              sub=list(substitute(paste(hat(p),"=",pvalue),list(pvalue=p)),col=2),
              panel=function(...){
                panel.histogram(...)
                panel.abline(v=reps_0,col=2,lwd=2)
              })
  )
}

##Data: Example 8.1
attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))

cvm1 <- cvm(x,y,"Example 8.1")

##Data: Example 8.2
x <- sort(as.vector(weight[feed == "sunflower"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)

cvm2 <- cvm(x,y,"Example 8.2")

##Results
print(cvm1)
print(cvm2)

2:

Problem statement Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations: (1) unequal variances and equal expectations; (2) unequal variances and unequal expectations; (3) non-normal distributions: t distribution with 1 df (heavy-tailed distribution) and a bimodal distribution (mixture of two normal distributions); (4) unbalanced samples (say, 1 case versus 10 controls).

Result

## variable definition
m <- 500 #permutation samples
p<-2 # dimension of data
n1 <- n2 <- 50 #the sample size of x and y
R<-999 #boot parameter
k<-3 #boot parameter
n <- n1 + n2
N = c(n1,n2)
# the function of NN method
Tn <- function(z, ix, sizes,k){
  n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
  if(is.vector(z)) z <- data.frame(z,0);
  z <- z[ix, ];
  NN <- nn2(data=z, k=k+1) 
  block1 <- NN$nn.idx[1:n1,-1]
  block2 <- NN$nn.idx[(n1+1):n,-1]
  i1 <- sum(block1 < n1 + .5)
  i2 <- sum(block2 > n1+.5)
  (i1 + i2) / (k * n)
}

eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", 
                 sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}
p.values <- matrix(NA,m,3) # storage for the p-values of the three tests

(1)Unequal variances and equal expectations

##(1)Unequal variances and equal expectations
set.seed(1)
sd <- 1.5
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p)
  y <- matrix(rnorm(n2*p,sd=sd),ncol=p)
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value#NN method
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value#energy methods
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value# ball method
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

The ball method has higher power than the NN and energy methods in this setting.

(2)Unequal variances and unequal expectations

##(2)Unequal variances and unequal expectations
set.seed(1)
mu <- 0.5
sd <- 1.5
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p)
  y <- matrix(rnorm(n2*p,mean=mu,sd=sd),ncol=p)
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value#NN method
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value#energy methods
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value# ball method
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
pow
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

The ball method has higher power than the NN and energy methods in this setting.

(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodal distribution (mixture of two normal distributions)

##Non-normal distributions: t distribution with 1 df (heavy-tailed
##distribution), bimodal distribution (mixture of two normal
##distributions)
set.seed(1)
mu <- 0.5
sd <- 2
for(i in 1:m){
  x <- matrix(rt(n1*p,df=1),ncol=p)
  y1 = rnorm(n2*p);  y2 = rnorm(n2*p,mean=mu,sd=sd)
  w = rbinom(n, 1, .5) # 50:50 random choice
  y <- matrix(w*y1 + (1-w)*y2,ncol=p)# normal mixture
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value#NN method
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value#energy methods
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value# ball method
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
pow
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

The energy method has higher power than the NN and ball methods in this setting.

(4)Unbalanced samples (say, 1 case versus 10 controls)

##Unbalanced samples 
set.seed(1)
mu <- 0.5
N = c(n1,n2*2)
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2*2),rnorm(n2*2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value#NN method
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value#energy methods
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value# ball method
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
pow
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

The energy method has higher power than the NN and ball methods in this setting.

9.3:

Problem statement
Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy $(\theta,\eta)$ distribution has density function $$f(x)=\frac{1}{\theta \pi (1+[(x-\eta)/\theta]^2)},\ \ -\infty < x <\infty,\theta>0$$ The standard Cauchy has the Cauchy($\theta=1,\eta=0$) density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)
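Idea: At each step, propose a candidate $Y \sim N(X_t, 1)$ and accept it with probability $$\alpha(X_t, Y) = \min\left\{1, \frac{f(Y)\,q(X_t \mid Y)}{f(X_t)\,q(Y \mid X_t)}\right\},$$ where $f$ is the standard Cauchy density and $q(\cdot \mid x)$ is the $N(x, 1)$ proposal density; otherwise keep $X_{t+1} = X_t$. This is the acceptance ratio computed as R in the code below.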

Result

set.seed(1)
n <- 10000 #Sample size
x <- numeric(n)
u <- runif(n)
theta=1
eta=0

x[1] <- rnorm(1)
k <- 0

# cauchy functions
f <- function(x, theta=1, eta=0){
  out <- 1/(pi * theta * (1+((x-eta)/theta)^2))
  return(out)
}

for(i in 2:n){
  xt <- x[i-1]
  y <- rnorm(1,mean=xt)
  R <- f(y)*dnorm(xt,mean=y)/(f(xt)*dnorm(y,mean=xt))
  if(u[i] <= R){
    x[i] <- y
  }else{
    x[i] <- xt
    k <- k+1
  }
}

is <- 1001:n
par(mfrow=c(1,2))
plot(is,x[is],type="l")
hist(x[is], probability=TRUE,breaks=100)
plot.x <- seq(min(x[is]),max(x[is]),0.01)
lines(plot.x,f(plot.x))
par(mfrow=c(1,1))
#compare the deciles (the 0% and 100% quantiles of the Cauchy are -Inf and Inf, so use 0.1,...,0.9)
observations <- quantile(x[is], seq(0.1, 0.9, 0.1))
expectations <- qcauchy(seq(0.1, 0.9, 0.1))
decile <- data.frame(observations,expectations)
decile

Homework 7

11.4:

Problem statement Find the intersection points $A(k)$ in $(0, \sqrt{k})$ of the curves

$$S_{k-1}(a)=P\left( t(k-1) >\sqrt {\frac {a^2(k-1)}{k-a^2}} \right)$$

and

$$S_{k}(a)=P\left( t(k) >\sqrt {\frac {a^2k}{k+1-a^2}} \right)$$

for k = 4:25, 100, 500, 1000, where t(k) is a Student t random variable with k degrees of freedom. (These intersection points determine the critical values for a t-test for scale-mixture errors proposed by Székely [260].)

Result

Sk_1 <- function(a,k){
  q <- sqrt(a^2*(k-1)/(k-a^2)) 
  return (1-pt(q,df=k-1))
}
Sk <- function(a,k){
  q <- sqrt(a^2*k/(k+1-a^2)) 
  return (1-pt(q,df=k))
}
difSK <- function(x,k) { 
  Sk_1(x,k)-Sk(x,k)
}
kset <- c(4:25,100,500,1000)
out <- 1:length(kset)
for (i in 1:length(kset)){
  out[i] <- uniroot( difSK
            , lower = 0+1e-5, upper = sqrt(kset[i])-1e-5,k=kset[i]) $root
}
out

# for some large k, uniroot converges to a spurious root near the endpoint sqrt(k);
# flag the k's whose computed root lies within 1% of sqrt(k)
kset[abs(out - sqrt(kset)) < sqrt(kset) * 0.01]

n <- 1:length(kset)
Kwrongnum <- n[abs(out - sqrt(kset)) < sqrt(kset) * 0.01]  # indices of the flagged k's


#Example : k=23
k=23
xx <- seq(0.01,sqrt(k)-1e-5,length=1000)
y <- difSK(xx,k)
plot(xx,y,type="l")

#Example : k=1000
k=1000
xx <- seq(0.01,sqrt(k)-1e-5,length=1000)
y <- difSK(xx,k)
plot(xx,y,type="l")

# re-solve the flagged cases with upper = 3, which brackets the genuine intersection

for (i in Kwrongnum){
  out[i] <- uniroot( difSK
                     , lower = 0+1e-5, upper =3,k=kset[i]) $root
}
names(out) <- kset

out

Homework 8

11.6

Problem statement Write a function to compute the cdf of the Cauchy distribution, which has density $$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)}, \qquad -\infty < x < \infty,\ \theta > 0.$$ Compare your results to the results from the R function pcauchy. (Also see the source code in pcauchy.c.)

Result

f <- function(y, theta, eta){
  1/(theta * pi * (1 + ((y - eta)/theta)^2))   # Cauchy(theta, eta) density
}

# cdf of the Cauchy(theta, eta) distribution by numerical integration
cauchy.cdf <- function(x, theta, eta, lower.tail = TRUE){
  if (lower.tail)
    res <- integrate(f, lower = -Inf, upper = x, rel.tol = .Machine$double.eps^0.25,
                     theta = theta, eta = eta)
  else
    res <- integrate(f, lower = x, upper = Inf, rel.tol = .Machine$double.eps^0.25,
                     theta = theta, eta = eta)
  return(res$value)
}
cauchy.cdf(x = 0, theta = 1, eta = 0)
pcauchy(0, location = 0, scale = 1)

cauchy.cdf(x = 2, theta = 2, eta = 1, lower.tail = FALSE)
pcauchy(2, location = 1, scale = 2, lower.tail = FALSE)

A-B-O blood type problem

Problem statement
Let the three alleles be A, B, and O. The observed phenotype counts used below are $n_{A\cdot}=28$, $n_{B\cdot}=24$, $n_{OO}=41$ and $n_{AB}=70$; use the EM algorithm to compute the maximum likelihood estimates of the allele frequencies $p$ and $q$ (with $r = 1 - p - q$).

        dat <- rbind(Genotype=c('AA','BB','OO','AO','BO','AB','Sum'),
                     Frequency=c('p^2','q^2','r^2','2pr','2qr','2pq','1'),
                     Count=c('nAA','nBB','nOO','nAO','nBO','nAB','n'))
    knitr::kable(dat,format='markdown',caption = "Genotype frequencies and observed counts",align = "c")
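
A brief sketch of the EM idea used below: the counts of the homozygous genotypes AA and BB are latent, so the E-step replaces them by their conditional expectations given the current estimates $(p_0, q_0)$ (this is what eval_f0 computes from x1), e.g.
$$E[n_{AA}\mid n_{A\cdot}, p_0, q_0] = n_{A\cdot}\,\frac{p_0^2}{p_0^2 + 2p_0 r_0}, \qquad r_0 = 1 - p_0 - q_0,$$
and the M-step maximizes the resulting expected complete-data log-likelihood over $(p, q)$, which is done numerically with nloptr.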

Result

library(nloptr)
# EM: eval_f0 is the negative expected complete-data log-likelihood (the E-step uses x1, the current estimate)

eval_f0 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {

  r1<-1-sum(x1)
  nAA<-n.A*x1[1]^2/(x1[1]^2+2*x1[1]*r1)
  nBB<-n.B*x1[2]^2/(x1[2]^2+2*x1[2]*r1)
  r<-1-sum(x)
  return(-2*nAA*log(x[1])-2*nBB*log(x[2])-2*nOO*log(r)-
           (n.A-nAA)*log(2*x[1]*r)-(n.B-nBB)*log(2*x[2]*r)-nAB*log(2*x[1]*x[2]))
}


# constraint
eval_g0 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {
  return(sum(x)-0.999999)
}

opts <- list("algorithm"="NLOPT_LN_COBYLA",
             "xtol_rel"=1.0e-8)
mle<-NULL
r<-matrix(0,1,2)
r<-rbind(r,c(0.2,0.35))# the beginning value of p0 and q0
j<-2
while (sum(abs(r[j,]-r[j-1,]))>1e-8) {
res <- nloptr( x0=c(0.3,0.25),
               eval_f=eval_f0,
               lb = c(0,0), ub = c(1,1), 
               eval_g_ineq = eval_g0, 
               opts = opts, x1=r[j,],n.A=28,n.B=24,nOO=41,nAB=70 )
j<-j+1
r<-rbind(r,res$solution)
mle<-c(mle,eval_f0(x=r[j,],x1=r[j-1,]))
}
r  #the result of EM algorithm
mle #the max likelihood values

Homework 9

3 p204

Problem statement Use both for loops and lapply() to fit linear models to mtcars using the formulas stored in this list: formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )

Result

attach(mtcars)
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
#1 for loops
f3<- vector("list", length(formulas))
for (i in seq_along(formulas)){
  f3[[i]] <- lm(formulas[[i]], data = mtcars)
}
f3
#2 lapply
la3<-lapply(formulas, function(x) lm(formula = x, data = mtcars))
la3

4

Problem statement Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function?

Result

set.seed(123)
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
# for loops
f4<- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)){
  f4[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]])
}
f4
# lapply without anonymous function
la4<- lapply(bootstraps, lm, formula = mpg ~ disp)
la4

5

Problem statement For each model in the previous two exercises, extract $R^2$ using the function below. rsq <- function(mod) summary(mod)$r.squared

Result:

rsq <- function(mod) summary(mod)$r.squared
#3
sapply(la3, rsq)
sapply(f3, rsq)
#4
sapply(la4,rsq)
sapply(f4,rsq)

3 p213

Problem statement The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.

Result:

set.seed(123)
trials <- replicate(
100,
t.test(rpois(10, 10), rpois(7, 10)),
simplify = FALSE
)
# anonymous function:
sapply(trials, function(x) x[["p.value"]])

6

Problem statement Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?

Result:

#example
options(warn = -1)
testlist <- list(iris, mtcars, cars)
lapply(testlist, function(x) vapply(x, mean, numeric(1)))
#a more specialized function:
lmapply <- function(X, FUN, FUN.VALUE, simplify = FALSE){
  out <- Map(function(x) vapply(x, FUN, FUN.VALUE), X)
  if(simplify == TRUE){return(simplify2array(out))}
  out
}
lmapply(testlist, mean, numeric(1))
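
When all inputs have the same columns, setting simplify = TRUE collects the outputs into a matrix; a small usage example of the same lmapply (the second data set is just a subset of mtcars chosen for illustration):

lmapply(list(mtcars, mtcars[1:16, ]), mean, numeric(1), simplify = TRUE)  # 11 x 2 matrix of column means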

Homework 10

4 p365

Problem statement Make a faster version of $\textbf{chisq.test()}$ that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying $\textbf{chisq.test()}$ or coding from the mathematical definition.

answer

As $\textbf{chisq.test()}$ has a long body, we write a shorter version:

chisq.test2 <- function(x, y){

  # Input
  if (!is.numeric(x)) {
    stop("x must be numeric")}
  if (!is.numeric(y)) {
    stop("y must be numeric")}
  if (length(x) != length(y)) {
    stop("x and y must have the same length")}
  if (length(x) <= 1) {
    stop("length of x must be greater one")}
  if (any(c(x, y) < 0)) {
    stop("all entries of x and y must be greater or equal zero")}
  if (sum(complete.cases(x, y)) != length(x)) {
    stop("there must be no missing values in x and y")}
  if (any(is.null(c(x, y)))) {
    stop("entries of x and y must not be NULL")}

  # compute the theoretical value
  m <- rbind(x, y)#the actual value
  margin1 <- rowSums(m)
  margin2 <- colSums(m)
  n <- sum(m)
  me <- tcrossprod(margin1, margin2) / n #the theoretical value

  # Output
  STATISTIC = sum((m - me)^2 / me)
  dof <- (length(margin1) - 1) * (length(margin2) - 1)#degree of freedom
  p <- pchisq(STATISTIC, df = dof, lower.tail = FALSE)
  return(list(X_squared = STATISTIC, df = dof, `p-value` = p))
}

Check if $\textbf{chisq.text2()}$ returns the same results:

a <- 11:15
b <- c(11,12.5,13.5,14.5,15.5)
m_test <- cbind(a,b)

identical(chisq.test(m_test),chisq.test2(a, b))

The returned objects are not identical (chisq.test returns a full htest object), so we print both results for comparison.

chisq.test(m_test)

chisq.test2(a, b)

Comparing the printed output, the statistic and degrees of freedom agree, with only a slight difference in the p-value.

Finally compare the run time:

chisq.test2c <- compiler::cmpfun(chisq.test2)

microbenchmark::microbenchmark(
  chisq.test(m_test),
  chisq.test2(a,b),
  chisq.test2c(a,b)
)



5 p365

Problem statement
Can you make a faster version of \textbf{table()} for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

answer

table2 <- function(x,y){

  x_val <- unique(x)
  y_val <- unique(y)
  mat <- matrix(0L, length(x_val), length(y_val))
  for (i in seq_along(x)) {
    mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <-
      mat[which(x_val == x[[i]]),  which(y_val == y[[i]])] + 1L
  }
  dimnames <- list(x_val, y_val)
  names(dimnames) <- as.character(as.list(match.call())[-1])  # R has names for dimnames... :/
  tab <- array(mat, dim = dim(mat), dimnames = dimnames)
  class(tab) <- "table"
  tab
}

Then check if $\textbf{table2()}$ returns the same results:

x <- c(1, 2, 3, 1, 2, 3)
y <- c(2, 3, 4, 2, 3, 4)
identical(table(x,y), table2(x,y))

Finally compare the run time:

microbenchmark::microbenchmark(table(x,y), table2(x,y))

