Exercise 1

example 1

x<-seq(0,2*pi,length=100)
y1<-sin(x)
y2<-cos(x)
plot(x,y1,type="l",lwd=2,ylab="y")
lines(x,y2,lwd=2,lty=2)
abline(h=0,lwd=2)
abline(v=0,lwd=2)

example 2

set.seed(1958);
y<-rnorm(100)
hist(y,freq=FALSE)
res<-density(y);
lines(res,col="red")

example 3

library(lattice)
data(quakes); 
mini <- min(quakes$depth) ;
maxi <- max(quakes$depth) ;
int <- ceiling((maxi - mini)/9);
inf <- seq(mini, maxi, int) ;
quakes$depth.cat <- factor(floor(((quakes$depth - mini) / int)), labels=paste(inf, inf + int, sep="-")) ;
xyplot(lat ~ long | depth.cat, data = quakes)

example 4

data("pressure");
names(pressure)
lm.summary<-lm(temperature~pressure,data=pressure)
plot(x=pressure$pressure,y=pressure$temperature,xlab="pressure",ylab="temperature")
plot(lm.summary)

example 5

cells<-c(1,16,24,68)
rnames<-c("R1","R2")
cnames<-c("C1","C2")
mymatrix<-matrix(cells,nrow=2,ncol=2,byrow=TRUE,dimnames = list(rnames,cnames))
mymatrix

title: "Homework-18003-2018/09/21" author: "A18003" date: "2018/09/21" output: html_document


3.5

Idea:
1.Generate U~U(0,1)
2.If $F_X$($x_{i-1}$)$<$U$\le$$F_X$($x_{i}$),Then X=$x_i$

set.seed(2)
x<-c(0,1,2,3,4);
p<-c(0.1,0.2,0.2,0.2,0.3);
cp<-cumsum(p);
m<-10^3;
r<-numeric(m);
r<-x[findInterval(runif(m),cp,left.open = TRUE)+1]#"left.open=TRUE,because in findInterval function ,the intervals are open at right and closed at left.
r
ct<-as.vector(table(r));#calulate frequency of the table "r"
ct/sum(ct);

3.7

Idea:
1.$$Generate\quad random\quad numbers\quad U\sim U(0,1)\quad and\quad Y\sim g(.)$$
2.$$If\quad U\le\rho (Y)\quad then\quad accept\quad Y\quad and\quad stop(return X=Y) \quad\rho(x)=\frac{f(x)}{cg(x)}$$
3.$$beta(\alpha,\beta) \quad with\quad pdf\quad f(x)=\frac{x^{\alpha-1}(1-x)^{\beta-1}}{B(\alpha,\beta)}. \g(x)=1,0<x<1 \quad and\quad c=\frac{1}{B(\alpha,\beta)} $$

rnbeta<-function(a,b,N){
  n<-N;
  j<-k<-0;
  y<-numeric(n);
  while(k<n){
    u<-runif(1);
    j<-j+1;
    x<-runif(1) #random variate from g
    if(x^(a-1)*(1-x)^(b-1)>u){
      #we aceept x
 k<-k+1;
 y[k]<-x;}

  }
return(y)};
x<-rnbeta(3,2,1000);
p1 = hist(x,plot = T,freq = FALSE,breaks=seq(0,1,0.01),col="yellow",main = "a-r menthod for generating random number");
d=seq(from=min(x),to=max(x),by=0.01) 
lines(x=d,y=dbeta(d,3,2),lty=2,col="red")#add density curve

3.12

Question:Simulate a continuous Exponential-Gamma mixtrue

n<-1000;
r<-4;
beta<-3;
lambda<-rgamma(n,r,beta);
x<-rexp(n,lambda);
hist(x,plot=T,freq=FALSE,breaks=50,col="skyblue",main="Exponential-Gamma mixture")
lines(density(x,bw=1),col='red',lty=2)

title: "Homework-18003-2018/9/28" author: "A18003" date: "2018/9/28" output: html_document


5.4

Write a function to compute a Monte Carlo estimate of the Beta(3, 3) cdf,and use the function to estimate F(x) for x = 0.1, 0.2, . . ., 0.9. Compare the
estimates with the values returned by the pbeta function in R
$$\Phi(x)=\int^{x}{0}30t^2(1-t)^2dt\let\,\,y=\frac{t}{x},we\,have\,dt=xdy\,\,\,then\\theta=\int^{1}{0}30x^3y^2(1-xy)^2dy$$

x<-seq(0.1,0.9,length = 9)
m<-10000
set.seed(111)
u<-runif(m)
cdf <- numeric(length(x))
for (i in 1:length(x)) {
g<-30*(x[i])^3 *u^2*(1-x[i]*u)^2
cdf[i]<-mean(g) 
}
Phi <- pbeta(x,3,3)
print(round(rbind(x, cdf, Phi), 3))

5.9

The Rayleigh density is $$f(x)=\frac{x}{\sigma^2}e^{-x^2/(2\sigma^2)}\,, x \,\, 0, \sigma > 0.\ \Phi(x)=\int_{0}^{x}\frac{t}{\sigma^2}e^{-t^2/(2\sigma^2)}dt\let\,\,y=\frac{t}{x}??we\,have\,dt=xdy\,\,\,then\\theta=\int_{0}^{1}\frac{xy}{\sigma^2}e^{-(xy)^2/(2\sigma^2)}xdy$$ Implement a function to generate samples from a Rayleigh(??) distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1, X_2$?

MC.Phi<-function(x,sigma,R= 10000, antithetic = TRUE) {
u <- runif(R/2)
if (!antithetic)
v<- runif(R/2)
else 
v<-1-u
w<-c(u, v)
cdf <- numeric(length(x))
for (i in 1:length(x)) {
g <-x[i]*w/(sigma)^2*exp((-(x[i]*w)^2/(2*sigma*sigma)))*x[i]
cdf[i] <- mean(g)
}
return(cdf)
}
x <- seq(.1,2.5,length=5);
pRayleigh<-function(x,sigma){
  s<-sigma;
  p<-numeric(length(x));
  intergrand<-function(x){
    x/(s^2)*exp((-x^2/(2*(s^2))))};
  for(i in 1:length(x)){
  p[i]<-integrate(intergrand,0,x[i])$value;
  }
  return(p)
}
Phi<-pRayleigh(x,sigma=2)
set.seed(123)
MC1<- MC.Phi(x,sigma=2,anti=FALSE) #for (X1+X2)/2 which X1,X2 is independent
set.seed(123)
MC2<- MC.Phi(x,sigma=2,anti=TRUE)  #for antithetic variables (X+X')/2
print(round(rbind(x, MC1, MC2, Phi),5))
m <- 1000
MC1 <- MC2 <- numeric(m)
x <- 1.95
for (i in 1:m) {
MC1[i] <- MC.Phi(x,2,R = 1000, anti = FALSE)
MC2[i] <- MC.Phi(x,2,R = 1000,anti=TRUE)
}
print(sd(MC1))
print(sd(MC2))
print((var(MC1) - var(MC2))/var(MC1))

The antithetic variable approach achieved approximately 91.9% reduction in variance at x = 1.95.and $\sigma=2$

5.13

Find two importance functions f1 and f2 that are supported on $(1,\infty )$ and are ??close?? to $$g(x)=\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx\quad x > 1.$$ Which of your two importance functions should produce the smaller variance in estimating $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling? Explain.
Solve:
let $f_1(x)=\frac{1}{\sqrt{2\pi}}e^{-x^2/2}\quad -\infty<x<+\infty$
$\quad f_2(x)=e^{-x+1}\quad 1<x<+\infty$

x<-seq(1,10,length=100)
y<-1/sqrt(2*pi)*exp(-x^2/2);
z<-exp(-x+1)
plot(x,y,col="red",lty=1,xlim = c(0,10))
lines(x,z,col="green",lty=1,xlim=c(0,10))
y1<-x^2;
z1<-y1*y/z;
plot(x,y1,col="red",lty=1,xlim = c(0,10))
lines(x,z1,col="green",lty=1,xlim=c(0,10))

we can see both f1 and f2 are close to g

5.14

Obtain a Monte Carlo estimate of $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling.

m <- 10000
set.seed(111)
theta.hat <- se <- numeric(2)
g <- function(x) {
exp(-x^2/2)*x^2/(sqrt(2*pi))*(x >1)
}
x <- rnorm(m) #using f1
fg <- g(x)/dnorm(x)
theta.hat[1] <- mean(fg)
se[1] <- sd(fg)
set.seed(222)
u <- runif(m) #f2, inverse transform method
x <- 1- log(1 - u)
fg <- g(x) / (exp(-x+1))
theta.hat[2] <- mean(fg)
se[2] <- sd(fg)
print(rbind(theta.hat, se))

we can see f2 has smaller variance

title: "Homework-18003-2018/10/12" author: "A18003" date: "2018/10/12" output: html_document


6.9

Let X be a non-negative random variable with$\mu =E[X] < \infty$. For a random sample$x_1,...,x_n$from the distribution of X, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum^{n}{j=1}\sum^{n}{i=1}|x_i-x_j|$$ The Gini ratio is applied in economics to measure inequality in income distribution.Note that G can be written in terms of the order statistics$x_{(i)}$as $$G=\frac{1}{n^2\mu}\sum^{n}{i=1}(2i-n-1)x{(i)}.$$ f the mean is unknown, let $\hat G$ be the statistic G with $\mu$ replaced by$\bar x$. Estimate by simulation the mean, median and deciles of $\hat G$ if X is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.

n=50
m=1000
G_sam1<-numeric(m)
G_sam2<-numeric(m)
G_sam3<-numeric(m)
set.seed(110)
for(i in 1:m)
{
  x<-sort(rlnorm(n,0,1))
  y<-sort(runif(n,0,1))
  z<-sort(rbinom(n,1,0.5))
  J=2*seq(1:n)-n-1
  G_sam1[i]=(J%*%x)/(n^2*mean(x))
  G_sam2[i]=(J%*%y)/(n^2*mean(y))
  G_sam3[i]=(J%*%z)/(n^2*mean(z))
}
log_norm<-c(mean(G_sam1),median(G_sam1),quantile(G_sam1,seq(0,1,0.1)))
unif_norm<-c(mean(G_sam2),median(G_sam1),quantile(G_sam2,seq(0,1,0.1)))
Bern_norm<-c(mean(G_sam2),median(G_sam1),quantile(G_sam3,seq(0,1,0.1)))
A=rbind(log_norm,unif_norm,Bern_norm)
colnames(A)=c("mean","median",names(quantile(G_sam1,seq(0,1,0.1))))
print(A)
hist(G_sam1,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for lognorm")
hist(G_sam2,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for  uniform")
hist(G_sam3,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for  Bernoulli")

6.10

Construct an approximate 95% confidence interval for the Gini ratio $\gamma = E[G]$ if X is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

n=20
m=100
G_sam1<-numeric(m)
alpha=0.025
UCL_low<-numeric(1000)
UCL_upp<-numeric(1000)
Mean<-numeric(1000)
for(k in 1:1000)
{ for(i in 1:m) #generate m replitcates
 {
  x<-sort(rlnorm(n,0,1))
  J=2*seq(1:n)-n-1
  G_sam1[i]=(J%*%x)/(n^2*mean(x))
 }

UCL_low[k]<-  mean(G_sam1)-qnorm(1-alpha)*sd(G_sam1)/sqrt(m)
UCL_upp[k]<-  mean(G_sam1)+qnorm(1-alpha)*sd(G_sam1)/sqrt(m)
Mean[k]<-mean(G_sam1)
}
fin_mean<-mean(Mean)
k=sum(UCL_low<fin_mean &UCL_upp>fin_mean)
k/1000

6B

Tests for association based on Pearson product moment correlation $\rho$, Spearman??s rank correlation coefficient $\rho_s$, or Kendall??s coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$, are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X, Y ) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.

library(MASS)
mu0<-c(1,1)
n<-20
m <- 1000
ptest<-stest<-ktest<-numeric(m)
pho<-seq(0.1,0.9,length=21)
#alternatives
power1 <- numeric(21)
power2 <- numeric(21)
power3 <- numeric(21)
set.seed(122)
for (i in 1:21) {
sigma_i <-matrix(c(1,0.1+0.04*(i-1),0.1+0.04*(i-1),1),nrow = 2)
for(k in 1: m){
x <- mvrnorm(n,mu0,sigma_i)
ptest[k]<- cor.test(x[,1],x[,2],alternative = "greater",method = "pearson")$p.value
stest[k]<-cor.test(x[,1],x[,2],alternative = "greater",method = "spearman")$p.value
ktest[k]<-cor.test(x[,1],x[,2],alternative = "greater",method = "kendall")$p.value
}
power1[i] <- mean(ptest<=0.05)
power2[i] <- mean(stest<=0.05)
power3[i] <- mean(ktest<=0.05)
} 
plot(pho, power1,type="l",col="blue")
lines(pho,power2,type="l",col="red")
lines(pho,power3,type="l",col="yellow")

for X Y is independent

library(MASS)
m=1000
n=20
mean<-c(1,1)
sigma=matrix(c(1,0,0,1),2,2)
p_pearson<-numeric(m)
p_Spearman<-numeric(m)
p_Kendall<-numeric(m)
for(i in 1:m)
{
  sample=mvrnorm(n,mean,sigma)
  x=sample[,1]
  y=sample[,2]
  test1<-cor.test(x,y,method="pearson")
  test2<-cor.test(x,y,method="kendall")
  test3<-cor.test(x,y,method="spearman")
  p_pearson[i]<-test1$p.value
  p_Spearman[i]<-test2$p.value
  p_Kendall[i]<-test3$p.value
}

ratio=c(mean(p_pearson<0.05),mean(p_Spearman<0.05),mean(p_Kendall<0.05))
names(ratio)<-c("peason","Speqrman","Kendall")
print(ratio)

As shown from the result,at least one of the nonparaments test have better empirical power than correlation test.

title: "Homework-2018.11.02" author: "A18003" date: "2018/11/02" output: html_document


7.1

Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.

library(bootstrap);
attach(law);#attach data
n = length( LSAT )
R.hat = cor( LSAT,GPA )
R.jack = numeric ( n )
for (i in 1:n)
  { R.jack[i] = cor( LSAT[-i],GPA[-i] ) } #jackknife
bias.jack = (n-1)*( mean(R.jack) - R.hat )
R.bar = mean(R.jack)
se.jack = sqrt( (n-1)*mean( (R.jack-R.bar)^2 ) )

we find r bias.jack for the bias and r se.jack for std.error

7.5

Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures$\frac{1}{\lambda} $by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.

require(boot); 
attach(aircondit)
x=hours
gaptime.hat = mean(x) #MLE of 1/lambda
B = 5000  #no. boostrap resamples
set.seed(5)
exc75.boot = boot( data=aircondit,statistic=function(x,i){mean(x[i,])}, R=B )

The bootstrap summary reports:

exc75.boot

we can see the MLE is $\frac{1}{\hat \lambda}$=r exc75.boot$t0,with estimated std.error of$\hat{se}_{boot}$=37.86823
when we calculate the confidence interval by BCa,we need the empirical influence values of the statistic of interest for the observed data.
So

einf.jack = empinf(exc75.boot, type='jack')#where the call to empinf() is used to generate an empirical influence object using the usual jackknife. 
boot.ci(exc75.boot, type=c('norm','basic','perc','bca'), L=einf.jack)

We see the intervals are quite disparate. A primary reason is that the bootstrap distribution is still skewed, affecting the simpler methods and their appeal to the Central Limit Theorem. For example

hist(exc75.boot$t, main='', xlab=expression(1/lambda), prob=T)
points(exc75.boot$t0, 0, pch = 19,col="red")

(the solid dot, ??, indicates the MLE). The BCa interval incorporates an acceleration adjustment for skew, and may be preferred here.

7.8

Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat \theta$.

require(bootstrap); 
attach(scor);#attach data
theta = function(x){
eigen(cov(x))$values[1]/sum(eigen(cov(x))$values)
} #end function
n=length(scor[,1]);
x=as.matrix(scor);
lambda.hat = eigen(cov(x))$values
theta.hat = lambda.hat[1]/sum(lambda.hat)
theta.jack=numeric(n)
for (i in 1:n) {theta.jack[i]=theta( x[-i,] ) }#jackknife
bias.jack=(n-1)*(mean(theta.jack)-theta.hat)
theta.bar=mean(theta.jack)
se.jack = sqrt((n-1)*mean( (theta.jack-theta.bar)^2 ))
print( list(theta.hat=theta(scor), bias=bias.jack, se=se.jack) )
detach(package:bootstrap)

we see $\hat{bias}{jack}(\hat\theta)$ =r bias.jack,and $\hat{se}{jack}(\hat\theta)$=r se.jack

7.11

In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.

library(DAAG);
attach(ironslag);
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- 0;
# for n/2-fold cross validation
# fit models on leave-two-out samples
for (k in 1:(n-1)) {
a<-c(k,(k+1))
y <- magnetic[-a]
x <- chemical[-a]
J1 <- lm(y ~ x)
ee1 = magnetic[a]- predict( J1, newdata=data.frame(x=chemical[a]) )
e1[k]<-sum(ee1^2)
J2 <- lm(y ~ x + I(x^2))
ee2 = magnetic[a]- predict( J2, newdata=data.frame(x=chemical[a]) )
e2[k]<-sum(ee2^2)
J3 <- lm(log(y) ~ x)
ee3 = magnetic[a]- exp(predict( J3, newdata=data.frame(x=chemical[a])))
e3[k]<-sum(ee3^2)
J4 <- lm(log(y) ~ log(x))
ee4 = magnetic[a]- exp(predict( J4, newdata=data.frame(x=chemical[a])))
e4[k]<-sum(ee4^2)
}
print( list(mse1=mean(e1/2),mse2=mean(e2/2),mse3=mean(e3/2),mse4=mean(e4/2)) )

We can see Model 2 has the least prediction error 17.96998.
So according to the prediction error criterion, Model 2, the quadratic model,would be the best fit for the data.


title: "Homwork-2018-11-09" author: "A18003" date: "2018/11/09" output: html_document


knitr::opts_chunk$set(echo = TRUE)

8.1

Implement the two-sample Cram??er-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2 Cram??er-von Mises where the Cram??er-von Mises distance is defined by $$W_2=\frac{mn}{(m+n)^2}[\sum^n_{i=1}(F_n(x_i)-G_m(x_i))^2+\sum^m_{j=1}(F_n(y_j)-G_m(y_j))^2]$$

cvm<-function(x,y,N){
  reps<-numeric(N);#permutation numbers
  n<-length(x);
  m<-length(y);
  f<-ecdf(x);
  g<-ecdf(y);
  z<-c(x,y);
  t<-numeric(N);
  t0<-m*n/(m+n)^2*(sum((f(x)-g(x))^2)+sum((f(y)-g(y))^2));
  for(i in 1:N){
    k<-sample(length(z),size=n,replace = FALSE);
    x1<-z[k];
    y1<-z[-k];
    f1<-ecdf(x1);
    g1<-ecdf(y1);
    t[i]<-m*n/(m+n)^2*(sum((f1(x)-g1(x))^2)+sum((f1(y)-g1(y))^2))
  } 
  return(c(t0,t));
}
#example 1
set.seed(2)
attach(chickwts)
x1 <- sort(as.vector(weight[feed == "soybean"]));
y1 <- sort(as.vector(weight[feed == "linseed"]));
cvm1<-cvm(x1,y1,999);
p1 <- mean(cvm1 >= cvm1[1]);
p1
hist(cvm1, main = "Cram??er-von Mises test", freq = FALSE, xlab = "w (p = 0.385)",breaks = "scott");
points(cvm1[1], 0, cex = 1, pch = 16);
detach(chickwts)

from the table we can accept x and y have the same distribution.

experiment

Design experiments for evaluating the performance of the NN,energy, and ball methods in various situations.
> Unequal variances and equal expectations
> Unequal variances and unequal expectations
> Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)
> Unbalanced samples (say, 1 case versus 10 controls)
> Note: The parameters should be choosen such that the powers are distinguishable (say, range from 0.3 to 0.9).

example 1

library(RANN)
library(energy)
library(Ball)
library(boot)
library(ggplot2)
set.seed(123)
m <- 50; k<-3; p<-2;
n1<- n2 <- 50; R<-999; n <- n1+n2; N = c(n1,n2)

Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
if(is.vector(z)) z <- data.frame(z,0);
z <- z[ix, ];
NN <- nn2(data=z, k=k+1) # what's the first column?
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}

eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}

p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <- matrix(rnorm(n1*p,mean = 1,sd = 0.6),ncol=p);
  y <- matrix(rnorm(n2*p,mean = 1,sd = 0.8),ncol=p);
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

example 2

set.seed(122)
m <- 50; k<-3; p<-2;
n1<- n2 <- 50; R<-999; n <- n1+n2; N = c(n1,n2)

Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
if(is.vector(z)) z <- data.frame(z,0);
z <- z[ix, ];
NN <- nn2(data=z, k=k+1) # what's the first column?
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}

eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}

p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <- matrix(rnorm(n1*p,mean = 0.3,sd = 0.6),ncol=p);
  y <- matrix(rnorm(n2*p,mean = 0.4,sd = 0.8),ncol=p);
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

example 3

set.seed(124)
m <- 50; k<-3; p<-2; 
n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2)

Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
if(is.vector(z)) z <- data.frame(z,0);
z <- z[ix, ];
NN <- nn2(data=z, k=k+1) # what's the first column?
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}

eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}

p.values <- matrix(NA,m,3)
for(i in 1:m){
  # t distribution with 1 df (heavy-tailed distribution)
  x <- matrix(rt(n1*p,df = 1),ncol=p); 
  #bimodel distribution (mixture of two normal distributions)
  y <- cbind(rnorm(n2,mean = 0.4),rnorm(n2,mean = 0.5));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

example 4

set.seed(125)
m <- 50; k<-3; p<-2; 
n1 <- 10;n2 <- 100;R<-999; n <- n1+n2; N = c(n1,n2)

Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
if(is.vector(z)) z <- data.frame(z,0);
z <- z[ix, ];
NN <- nn2(data=z, k=k+1) # what's the first column?
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}

eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}

p.values <- matrix(NA,m,3)
for(i in 1:m){
  x <- c(rnorm(n1,mean = 1,sd = 1)); # n1 = 10
  y <- c(rnorm(n2,mean = 2,sd = 2)); # n2 = 100
  z <- c(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
  p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value
}
alpha <- 0.05;
pow <- colMeans(p.values<alpha)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

9.3

Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy(??, ??) distribution has density function $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)}$$ The standard Cauchy has the Cauchy(?? = 1, ?? = 0) density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)

f903 <- function(x, eta=0, theta=1) {
stopifnot( theta > 0 )
return( 1 / (pi*theta * (1 + ((x-eta)/theta)^2)) )
} #end function
set.seed(903)
m <- 10000
sigma = 1 #try sigma = 1 for proposal scale
x <- numeric(m)
x[1] <- rnorm(1,0,sigma) #initialize with X0~N(0,sigma^2)
k <- 0
u <- runif(m)
for (i in 2:m) {
xt <- x[i-1]
y <- rnorm(1, mean = xt, sd = sigma)
num <- f903(y) * dnorm(xt, mean = y, sd = sigma)
den <- f903(xt) * dnorm(y, mean = xt, sd = sigma)
if (u[i] <= num/den) x[i] <- y else {
x[i] <- xt
k <- k+1 #y is rejected
} #end if/else
} #end for loop
print(k/m) #rejection rate

(One could just use dcauchy(x) instead of the constructed function f903(x) for the Cauchy p.d.f.) The rejection rate is given asr k/m which is rather low (i.e., higher-than-desired acceptance). Setting the burn-in to bo = 1000 draws, the retained chain is found via

b0 = 1000 #burn-in
index = (b0+1):m
y1 = x[index] #chain after burn-in

To compare deciles with the theoretical Cauchy(0,1) p.d.f., sample R code is

p10 = seq(.1,.9, .1)
round( rbind( quantile(y1, p10), qcauchy(p10) ), 3 )

which produces not-unreasonable comparisons, but with some discrepancies in the tails: The trace plot, from

plot( y1~index, type="l", main="", ylab="x" )

Interestingly, a histogram of the retained chain (using b0 = 1000), with the target Cauchy(0,1) p.d.f., from the R code

hist( y1, prob=T, main='', xlab='x', ylim=c(0,.35), breaks=50 )
xarg = seq( min(y1), max(y1), 0.1 )
lines( xarg, f903(xarg), ylim=c(0,.35) )

Moving to, say, m = 50,000 draws leads to slight, but not satiating improvement (try it!). Living up to its reputation, the Cauchy is a difficult distribution with which to operate.

9.6

Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are278 Statistical Computing with R (125, 18, 20, 34). Assume that the probabilities of the corresponding multinomial distribution are $$(\frac{1}{2}+\frac{\theta}{4},\frac{1-\theta}{4},\frac{1-\theta}{4},\frac{\theta}{4})$$ Estimate the posterior distribution of ?? given the observed sample, using one of the methods in this chapter.

solution

Set the prior on $\theta$??? to ??? ~ U(0,1) so that $\pi(\theta???) = I_{(0,1)}(\theta)$. The multinomial likelihood is$f(X|\theta???)\propto \prod ip_i(\theta)^{x_i}/x_i!$, where $p_1(\theta)=(2+\theta)/4,p_2(\theta???)=p_3(\theta)=(1-\theta)/4, and\,\,\, p_4(\theta???) =\theta ???/4$. Combining these via Bayes?? Rule produces the posterior $$f (\theta|X)=\pi(\theta)f(X|\theta)/f(X)\propto(2+\theta)^{x_1}(1-\theta)^{x_2+x_3}\theta^{x_4}$$ Notice that posterior specification to proportionality is all that is needed here, since any constants in$f(\theta???|X)$ will cancel in the construction of $r(\theta_t,Y)$ in the McMC sampler.
For the proposal density, we usually desire a p.d.f. $g(y|\theta)$ defined over a space the same as that for ???; here this is 0 < y < 1. The simple choice of$ g(y|\theta)=I
{(0,1)}(y)$, i.e., a uniform proposal independent of ???, corresponds to an independence sampler. However,independence samplers are not typically recommended. Instead, try a beta p.d.f.proposal with expected value equal to ???t. Different possibilities exist; one is $Y|\theta \sim\beta(\theta_t/{1-\theta_t},1)$, where$E[Y|\theta_t]=[\theta_???t/{1?C\theta_t}]/[1+(\theta_???t/{1?C\theta_???t})] =\theta_t (For 0 < \theta <1)$, this is a right-skewed, decreasing p.d.f.) Sample R code for an McMC sampler from$ f(\theta???|X)$

f906 <- function( th,x ) {
if (th<0 || th>1 ) return (0)
(2+th)^x[1] * (1-th)^(x[2]+x[3]) * th^x[4]
} #end function
xdata = c( 125, 18, 20, 34 ) #observed multinom. data
m = 10000
set.seed(906)
th = numeric(m)
th[1] = runif(1) #initialize: sample from prior on theta
k = 0
u = runif(m)
##employ skewed beta proposal density
for (t in 2:m) {
xt = th[t-1]
alph = xt/(1-xt)
y <- rbeta(1, shape1=alph, shape2=1 )
numer = f906( y,xdata ) * dbeta( xt, y/(1-y), 1)
denom = f906( xt,xdata ) * dbeta( y, alph, 1)
if ( u[t] <= numer/denom )
th[t] = y else {
th[t] = th[t-1]
k = k + 1
} #end if/else
} #end for loop

Using this code, the posterior sample of 10,000 points resides in the vector th; its raw trace plot is found via

plot( th, type="l", ylim=range(th),
xlab=bquote(theta), ylab="posterior" )

This produces
which appears somewhat variable, but not otherwise unacceptable. A burn-in of perhaps bo = 2000 will likely suffice. The consequent histogram, found via

hist( th[2001:m], prob=T, breaks="Scott",
xlab=bquote(theta), ylab='posterior', main="" )

shows a generally unimodal, slightly left-skewed sample.
The associated posterior mean (a Bayesian estimate) of ???, and the corresponding posterior ??cell?? probabilities are found via

theta.hat = mean( th[2001:m] )
theta.hat

and

print(c( 0.5 + theta.hat/4, (1 - theta.hat)/4,
(1 - theta.hat)/4, theta.hat/4) ) #p.hat vector

respectively.


title: "Homework-2018-11-23" author: "A18003" date: "2018/11/23" output: html_document


11.4

Find the intersection points A(k) in $(0, \sqrt{k})$ of the curves $$S_{k-1}(a)=P(t(k-1)>\sqrt{\frac{a^2(k-1)}{k-a^2}})$$ and$$S_{k}(a)=P(t(k)>\sqrt{\frac{a^2k}{k+1-a^2}})$$ for k = 4 : 25, 100, 500, 1000, where t(k) is a Student t random variable with k degrees of freedom.

solution:

The upper-tail t-probabilities are found via the pt() function with the lower.tail=F option. The intersection of the two upper-tail functions $S_{k-1}(a)$and$S_k(a)$ occurs when their difference is zero. A quick plot of selected differences $S_{k-1}(a)-S_k(a)$ shows that the likely solutions occur in the interval$1<a<2$, so use this as the capture interval for application of Brent??s method

k = c( 4:25, 100, 500, 1000 )
object = function( a, df ){
 a2 = a^2
 arg = sqrt( a2*df/(df + 1 - a2) )
 Sk = pt( q=arg, df=df, lower=F)
 arg = sqrt( a2*(df-1)/(df - a2) )
 Skm1 = pt( q=arg, df=df-1, lower=F)
 return( Sk-Skm1 )
 } #end function
for ( i in 1:length(k) ) {
 print( c( k[i], uniroot(object, lower=1, upper=2, df=k[i])$root ) )
 } #end for loop 

title: "Homework-2018-12-07" author: "A18003" date: "2018/12/07" output: html_document


p204 Ex3

Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)

attach(mtcars)
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
#1 for loops
lf_3<- vector("list", length(formulas))
for (i in seq_along(formulas)){
  lf_3[[i]] <- lm(formulas[[i]], data = mtcars)
}
#2 lapply
la_3<-lapply(formulas, function(x) lm(formula = x, data = mtcars))

p204 Ex4

Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply().Can you do it without an anonymous function?

set.seed(123)
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
# for loops
lf_4<- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)){
  lf_4[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]])
}
# lapply without anonymous function
la_4<- lapply(bootstraps, lm, formula = mpg ~ disp)

p204 Ex5

For each model in the previous two exercises, extract R2 using the function below. rsq <- function(mod) summary(mod)$r.squared

rsq <- function(mod) summary(mod)$r.squared
#For the models in exercise 3:
sapply(la_3, rsq)
sapply(lf_3, rsq)
#For the models in exercise 4:
sapply(la_4,rsq)
sapply(lf_4,rsq)

P213 Ex3

The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.

set.seed(3)
trials <- replicate(
100,
t.test(rpois(10, 10), rpois(7, 10)),
simplify = FALSE
)
# anonymous function:
sapply(trials, function(x) x[["p.value"]])
# without anonymous function:
sapply(trials, "[[", "p.value")

title: "Homework-18003-2018/12/14" author: "A18003" date: "2018/12/14" output: html_document


Ex 4

Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or by coding from the mathematical definition

options(warn = -1)
chisq.test1<- function(x, y){
  #?ж?????ֵ?Ƿ?????
  if (!is.numeric(x)) {
    stop("x must be numeric")}
  if (!is.numeric(y)) {
    stop("y must be numeric")}
  if (length(x) != length(y)) {
    stop("x and y must have the same length")}
  if (length(x) <= 1) {
    stop("length of x must be greater one")}
  if (any(c(x, y) < 0)) {
    stop("all entries of x and y must be greater or equal zero")}
  if (sum(complete.cases(x, y)) != length(x)) {
    stop("there must be no missing values in x and y")}
  if (any(is.null(c(x, y)))) {
    stop("entries of x and y must not be NULL")}
  #????
  m <- rbind(x, y)
  margin1 <- rowSums(m)
  margin2 <- colSums(m)
  n <- sum(m)
  me <- tcrossprod(margin1, margin2) / n
  x_stat = sum((m - me)^2 / me)#chisqͳ??��
  dof <- (length(margin1) - 1) * (length(margin2) - 1)#???ɶ? 
  p <- pchisq(x_stat, df = dof, lower.tail = FALSE)#pֵ
  return(list(x_stat = x_stat, df = dof, `p-value` = p))
}
#check
a=31:35;
b=c(31,33,35,37,39);
m<-cbind(a,b)
chisq.test1(a,b)
chisq.test(m)
microbenchmark::microbenchmark(
  chisq.test(m),
  chisq.test1(a, b))

Ex 5

Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

table2 <- function(x, y) {
  x_val <- unique(x)
  y_val <- unique(y)
  mat <- matrix(0L, length(x_val), length(y_val))
  for (i in seq_along(x)) {
    mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <-
      mat[which(x_val == x[[i]]),  which(y_val == y[[i]])] + 1L
  }
  dimnames <- list(x_val, y_val)
  names(dimnames) <- as.character(as.list(match.call())[-1])  # R has names for dimnames... :/
  tab <- array(mat, dim = dim(mat), dimnames = dimnames)
  class(tab) <- "table"
  tab
}
a <- c(1, 2, 3, 1, 2, 3)
b <- c(2, 3, 4, 2, 3, 4)
identical(table(a, b), table2(a, b))
microbenchmark::microbenchmark(table(a, b), table2(a, b))
c<-c(1,2,3,4,5)
identical(table(c, c), table2(c, c))
microbenchmark::microbenchmark(table(c, c), table2(c, c))


qyc1996/StatComp18003 documentation built on May 5, 2019, 11:08 p.m.