title: "Homework-2018.09.14" author: "By 18085" output: html_document


Question

Write a .Rmd file to implement at least three examples of different types from the above books.

Answer

Example1

x=1:30   
y=matrix(x,nrow=6)
y

Example2

m=seq(1,5.5,0.5)  
l=matrix(m,nrow=2)
colnames(l)=c("a","b","c","d","e")
knitr::kable(l) 

Example3

n=rnorm(10)    
z=1:10
plot(n~z)

title: "Homework-2018.09.21" author: "By 18085" output: html_document: default


Question

Exercises 3.5,3.7 and 3.12

Answer

question 3.5

idea:

X has a discrete distribution, so we can use the inverse transform $F_{X}^{-1}\left ( u \right )= x_{i}$ for $F_{X}\left ( x_{i-1} \right )< u\leq F_{X}\left ( x_{i} \right )$. We generate $U\sim U\left ( 0,1 \right )$, map each value through the inverse transform, and compute the empirical probabilities. Finally, we build a matrix comparing the empirical with the theoretical probabilities.
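
As an aside (not required by the exercise), base R's sample() can draw from the same discrete distribution directly and gives a quick cross-check of the inverse-transform sample:

set.seed(1)
x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3)
r2=sample(x,size=1e3,replace=TRUE,prob=p)   # direct draw from the same distribution
prop.table(table(r2))                       # empirical frequencies, to compare with p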

answer

x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3)
cp=cumsum(p)
 m <- 1e3
r=numeric(m)
r <- x[findInterval(runif(m),cp)+1]
ct <- as.vector(table(r))
t=ct/sum(ct)
q=ct/sum(ct)/p
k=rbind(t,p,q)
colnames(k)=c("x=0","x=1","x=2","x=3","x=4")
rownames(k)=c("the empirical distribution","the theoretical distribution","the rate")
k

R code

x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3)
cp=cumsum(p)
 m <- 1e3
r=numeric(m)
r <- x[findInterval(runif(m),cp)+1]
ct <- as.vector(table(r))
t=ct/sum(ct)
q=ct/sum(ct)/p
k=rbind(t,p,q)
colnames(k)=c("x=0","x=1","x=2","x=3","x=4")
rownames(k)=c("the empirical distribution","the theoretical distribution","the rate")
k

question 3.7

idea:

The Beta(a,b) density is $f\left ( x \right )=\frac{1}{B(a,b)}x^{a-1}\left ( 1-x \right )^{b-1}$, $0<x<1$, where $B(a,b)=\int_{0}^{1}x^{a-1}(1-x)^{b-1}\,dx$. For the acceptance-rejection method we take the uniform proposal $g\left ( x \right )=1$ on $(0,1)$; since the kernel $x^{a-1}(1-x)^{b-1}$ is bounded by 1 when $a,b\geq 1$, we can accept a proposal $Y\sim g$ whenever $U\leq Y^{a-1}(1-Y)^{b-1}$ with $U\sim U\left ( 0,1 \right )$. We wrap this in a function that returns a random sample of size n from the Beta(a,b) distribution, and then take a=3, b=2, n=1000.
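
A small aside on efficiency (assuming a = 3, b = 2 as below): with the comparison "kernel > u" used in the function, the probability of accepting a uniform proposal equals $B(a,b)$, so on average about n/B(a,b) proposals are needed to obtain n accepted values:

a=3; b=2
beta(a,b)          # acceptance probability, 1/12 for Beta(3,2)
1000/beta(a,b)     # expected number of uniform proposals for n = 1000 draws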

answer

f<-function(a,b,n){
  j=w=0
  y=numeric(n)
  while(w<n){
    u <- runif(1)
    j <- j + 1 
    x1 <- runif(1)
    if (x1^(a-1) * (1-x1)^(b-1) > u) {
      w <- w + 1
      y[w] <- x1
    }
  }
  y
}
y=f(3,2,1000)
hist(y,prob=TRUE)
curve(dbeta(x,3,2),add=TRUE,col="green")

R code

f<-function(a,b,n){
  j=w=0
  y=numeric(n)
  while(w<n){
    u <- runif(1)
    j <- j + 1 
    x1 <- runif(1)
    if (x1^(a-1) * (1-x1)^(b-1) > u) {
      w <- w + 1
      y[w] <- x1
    }
  }
  y
}
y=f(3,2,1000)
hist(y,prob=TRUE)
curve(dbeta(x,3,2),add=TRUE,col="green")

question 3.12

idea:

We know r = 4 and $\beta = 2$. The mixture is generated hierarchically: first draw $\Lambda\sim \mathrm{Gamma}(r,\beta)$, then draw $Y\mid\Lambda\sim \mathrm{Exp}(\Lambda)$. Repeating this 1000 times gives 1000 random observations from the mixture.
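
As a hedged check of the simulation below: integrating out the Gamma rate shows that the mixture has the Lomax (Pareto type II) density $f(y)=\frac{r\beta^{r}}{(\beta+y)^{r+1}}$, $y>0$, which can be overlaid on a density histogram of the sample:

set.seed(1)
n=1000; r=4; beta1=2
t=rgamma(n,r,beta1)
x=rexp(n,t)
hist(x,prob=TRUE,breaks=30)
curve(r*beta1^r/(beta1+x)^(r+1),add=TRUE,col="red")   # theoretical mixture density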

answer

n=1000
r=4
beta1=2
t=rgamma(n,r,beta1)
x=rexp(n,t)
hist(x)
barplot(x)

R code

n=1000
r=4
beta1=2
t=rgamma(n,r,beta1)
x=rexp(n,t)
hist(x)
barplot(x)

title: "Homework-2018.09.28" author: "By 18085" output: html_document: default


Question

Exercises 5.4, 5.9, 5.13, and 5.14

Answer

question 5.4

Write a function to compute a Monte Carlo estimate of the Beta(3, 3) cdf, and use the function to estimate F(x) for x = 0.1, 0.2, ..., 0.9. Compare the estimates with the values returned by the pbeta function in R.

idea:

First, the Beta(3,3) distribution function is $F(x)=\int_{0}^{x}\frac{1}{B(3,3)}t^{2}(1-t)^{2}\,dt$. From the textbook, if a and b are finite then $\int_{a}^{b}h\left ( x \right )dx=\int_{a}^{b}g\left ( x \right )\frac{1}{b-a}dx=E\left [ g\left ( X \right ) \right ]$, where $X\sim U\left ( a,b \right )$ and $g\left ( x \right )=(b-a)h(x)$. By the Strong Law of Large Numbers, the expectation can be approximated by the sample mean $\frac{1}{m}\sum_{i=1}^{m}g(X_{i})$.
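
A one-point sanity check (an aside; by the symmetry of Beta(3,3), F(0.5) should equal 0.5, and 1/B(3,3) = 30):

set.seed(1)
t=runif(1e4,0,0.5)
mean(30*t^2*(1-t)^2*0.5)   # Monte Carlo estimate of F(0.5), should be close to 0.5
pbeta(0.5,3,3)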

answer:

f=function(x){
  m=10000
  t=runif(m,min=0,max=x)
  th=mean(1/beta(3,3)*t^(2)*(1-t)^(2)*x)
}
x=seq(0.1,0.9,by=0.1)
MC=numeric(9)
for(i in 1:9){
  result=f(x[i])
  MC[i]=result[[1]]
}
pBeta=pbeta(x,3,3)
m=rbind(MC,pBeta)
knitr::kable(m,col.names=x)
matplot(x,cbind(MC,pBeta),col=1:2,pch=1:2,xlim=c(0,1))
legend("topleft", inset=.05,legend=c("MC","pbeta"),col=1:2,pch=1:2)

R code

 f=function(x){
  m=10000
  t=runif(m,min=0,max=x)
  th=mean(1/beta(3,3)*t^(2)*(1-t)^(2)*x)
}
x=seq(0.1,0.9,by=0.1)
MC=numeric(9)
for(i in 1:9){
  result=f(x[i])
  MC[i]=result[[1]]
}
pBeta=pbeta(x,3,3)
m=rbind(MC,pBeta)
knitr::kable(m,col.names=x)

question 5.9

The Rayleigh density [156, (18.76)] is $f(x)=\frac{x}{\sigma^2}e^{-x^2/(2\sigma^2)},x\ge0,\sigma>0$. Implement a function to generate samples from a Rayleigh($\sigma$) distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1,X_2$?

idea:

We first calculate the cumulative distribution function, $F(x)=1-e^{-\frac{x^2}{2\sigma^2}}$, $x\ge 0$. Inverting it gives $X=F^{-1}(U)=\sqrt{-2\sigma^2\ln(1-U)}$ with $U\sim U(0,1)$. We then use antithetic variables to generate samples: g1 denotes $\frac{X+X'}{2}$ for an antithetic pair, and g2 denotes $\frac{X_1+X_2}{2}$ for independent $X_1,X_2$.
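
A quick check of the inverse-transform formula (an aside, assuming sigma = 2): the Rayleigh($\sigma$) mean is $\sigma\sqrt{\pi/2}$, so the sample mean of the generated values should be close to it:

set.seed(1)
sigma=2
u=runif(1e5)
x=sqrt(-2*sigma^2*log(1-u))
c(sample.mean=mean(x),theoretical=sigma*sqrt(pi/2))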

answer:

f1=function(n,sigma,p){
  x1=runif(n)
  if(p==1){
  x2=1-x1
  }
  else{
    x2=runif(n)
  }
  y1=sqrt(-2*sigma^2*log(1-x1))
  y2=sqrt(-2*sigma^2*log(1-x2))
  z=cbind(y1,y2)
  y=rowMeans(z)
  return(y)
}

n=1000
g1=f1(n,2,1)
g2=f1(n,2,0)
hist(g1)
z=c(sd(g1),sd(g2),sd(g1)/sd(g2))
names(z)=c("sd(g1)","sd(g2)","sd(g1)/sd(g2)")
z

R code

f1=function(n,sigma,p){
  x1=runif(n)
  if(p==1){
  x2=1-x1
  }
  else{
    x2=runif(n)
  }
  y1=sqrt(-2*sigma^2*log(1-x1))
  y2=sqrt(-2*sigma^2*log(1-x2))
  z=cbind(y1,y2)
  y=rowMeans(z)
  return(y)
}

n=1000
g1=f1(n,2,1)
g2=f1(n,2,0)
hist(g1)
z=c(sd(g1),sd(g2),sd(g1)/sd(g2))
names(z)=c("sd(g1)","sd(g2)","sd(g1)/sd(g2)")
z

question 5.13

Find two importance functions $f_1$ and $f_2$ that are supported on $(1, \infty)$ and are ‘close’ to $g(x) = \frac{x^2}{\sqrt{2\pi}} e^{-{x^2}/2}, x > 1$. Which of your two importance functions should produce the smaller variance in estimating $\int_1^\infty\frac{x^2}{\sqrt{2\pi}} e^{-{x^2}/2}\,dx$ by importance sampling? Explain.

idea:

We can define $f_{1}(x)=e^{-(x-1)}$ and $f_{2}(x)=1/x^{2}$, $x>1$, as two importance functions. $f_{1}$ is the density of $1+\mathrm{Exp}(1)$ (a shifted exponential), and $f_{2}$ is a Pareto density whose samples are obtained by the inverse transform $X=1/(1-U)$, $U\sim U(0,1)$. $\theta$ can then be estimated by $\hat{\theta }=\frac{1}{m}\sum_{i=1}^{m}\frac{g(X_{i})}{f(X_{i})}$.
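
For reference (an aside, not part of the original solution), the target integral has the closed form $\varphi(1)+1-\Phi(1)$, obtained by integrating $x\cdot x\varphi(x)$ by parts; it is a useful benchmark for both importance-sampling estimates:

dnorm(1)+1-pnorm(1)                                              # about 0.4006
integrate(function(x) x^2/sqrt(2*pi)*exp(-x^2/2),1,Inf)$value    # numerical check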

answer:

n=1000
f=function(x){
  x^2/sqrt(2*pi)*exp(-(x^2)/2)
}
f1=function(x){
  exp(-x+1)
}
f2=function(x){
   1/x^2
}
x=seq(1,5,0.1)
plot(x,f(x),col=1,type="o",ylim=c(0,1),lty=1,ylab="y")
points(x,f1(x),col=2,type="o",lty=1)
lines(x,f2(x),col=3,type="o",lty=1)
legend("topright",inset=.05,legend=c("f(x)","f1(x)","f2(x)"),lty=1,col=1:3,horiz=FALSE)
  X1=rexp(n,rate=1)+1
  theta1=mean(f(X1)/f1(X1))
  sd1=sd(f(X1)/f1(X1))/sqrt(n)   # standard error of the importance-sampling estimate
  X2=runif(n)
  X2<-1/(1-X2)
  theta2=mean(f(X2)/f2(X2))
  sd2=sd(f(X2)/f2(X2))/sqrt(n)
  a=c(theta1,sd1,theta2,sd2)
  a=matrix(a,nrow=2)
  colnames(a)=c("f1","f2")
  rownames(a)=c("theta","sd")
  a
  print("s1<s2,so f1 is better than f2")

R code

n=1000
f=function(x){
  x^2/sqrt(2*pi)*exp(-(x^2)/2)
}
f1=function(x){
  exp(-x+1)
}
f2=function(x){
   1/x^2
}
x=seq(1,5,0.1)
plot(x,f(x),col=1,type="o",ylim=c(0,1),lty=1,ylab="y")
points(x,f1(x),col=2,type="o",lty=1)
lines(x,f2(x),col=3,type="o",lty=1)
legend("topright",inset=.05,legend=c("f(x)","f1(x)","f2(x)"),lty=1,col=1:3,horiz=FALSE)
  X1=rexp(n,rate=1)+1
  theta1=mean(f(X1)/f1(X1))
  sd1=sd(f(X1)/f1(X1))/sqrt(n)
  X2=runif(n)
  X2<-1/(1-X2)
  theta2=mean(f(X2)/f2(X2))
  sd2=sd(f(X2)/f2(X2))/sqrt(n)
  a=c(theta1,sd1,theta2,sd2)
  a=matrix(a,nrow=2)
  colnames(a)=c("f1","f2")
  rownames(a)=c("theta","sd")
  a

question 5.14

Obtain a Monte Carlo estimate of $\int_1^\infty\frac{x^2}{\sqrt{2\pi}} e^{-{x^2}/2}\,dx$ by importance sampling.

idea:

We use the better importance function $f_1$ from Exercise 5.13 (the shifted exponential density $e^{-(x-1)}$, $x>1$) to obtain the Monte Carlo estimate.

answer

n=1000
f=function(x){
  x^2/sqrt(2*pi)*exp(-(x^2)/2)
}
f1=function(x){
  exp(-x+1)
}
  X1=rexp(n,rate=1)+1
  theta1=mean(f(X1)/f1(X1))
  sd1=sd(f(X1)/f1(X1))/sqrt(n)
  a=c(theta1,sd1)
  a=matrix(a,ncol=1)
  colnames(a)=c("f1")
  rownames(a)=c("theta","sd")
  a

R code

n=1000
f=function(x){
  x^2/sqrt(2*pi)*exp(-(x^2)/2)
}
f1=function(x){
  exp(-x+1)
}
  X1=rexp(n,rate=1)+1
  theta1=mean(f(X1)/f1(X1))
  sd1=sd(f(X1)/f1(X1))/sqrt(n)
  a=c(theta1,sd1)
  a=matrix(a,ncol=1)
  colnames(a)=c("f1")
  rownames(a)=c("theta","sd")
  a

title: "Homework-2018.10.12" author: "By 18085" output: html_document: default


Question

Exercises 6.9, 6.10, and 6.B

Answer

question 6.9

Let $X$ be a non-negative random variable with $\mu=E[x]<\infty$. For a random sample $x_1,\ldots,x_n$ from the distribution of $X$, the Gini ratio is defined by $$G=\frac1{2n^2\mu} \sum_{j=1}^n\sum_{i=1}^n |x_i-x_j|.$$ The Gini ratio is applied in economics to measure inequality in income distribution (see e.g. [163]). Note that G can be written in terms of the order statistics $x_{(i)}$ as $$G=\frac1{n^2\mu}\sum_{i=1}^n (2i-n-1)x_{(i)}.$$ If the mean is unknown,let $\hat{G}$ be the statistic $G$ with $\mu$ replaced by $\bar{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.

idea:

First, we write a function that computes $\hat{G}$ from a sample. Then we simulate replicates of $\hat{G}$ when X is standard lognormal, and repeat the procedure for the uniform distribution and Bernoulli(0.1). Finally, we report the mean, median and deciles and plot density histograms of the replicates.
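
For comparison with the simulated means (a hedged aside), the population Gini coefficients of the three sampled distributions have known closed forms: $2\Phi(\sigma/\sqrt{2})-1$ for the lognormal, 1/3 for U(0,1), and $1-p$ for Bernoulli(p):

c(lognormal=2*pnorm(1/sqrt(2))-1, uniform=1/3, bernoulli=1-0.1)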

answer:

f<-function(x){
  n<-length(x)
  a<-seq(1-n,n-1,2)
  i<-sort(x)
  y<-sum(a*i)/(n*n*mean(x))
  return(y)
}
n=500
m<-1000
t1<-numeric(m)
for(i in 1:m){
  q<-rnorm(n)
  p<-exp(q)
  t1[i]<-f(p)
}
result1<-c(mean(t1),median(t1),quantile(t1,probs=seq(0.1,0.9,0.1)))
names(result1)<-c("mean","median",paste0("decile",1:9))
print(result1)
hist(t1,prob=TRUE)
t2<-numeric(m)
for(i in 1:m){
  x<-runif(n) # then x is uniform
  t2[i]<-f(x)
}
result2<-c(mean(t2),median(t2),quantile(t2,probs=seq(0.1,0.9,0.1)))
names(result2)<-c("mean","median",paste0("decile",1:9))
print(result2)
hist(t2,prob=TRUE)
t3<-numeric(m)
for(i in 1:m){
  x<-rbinom(n,1,0.1) # then x is Bernoulli(0.1)
  t3[i]<-f(x)
}
result3<-c(mean(t3),median(t3),quantile(t3,probs=seq(0.1,0.9,0.1)))
names(result3)<-c("mean","median",paste0("decile",1:9))
print(result3)
hist(t3,prob=TRUE)

R code

f<-function(x){
  n<-length(x)
  a<-seq(1-n,n-1,2)
  i<-sort(x)
  y<-sum(a*i)/(n*n*mean(x))
  return(y)
}
n=500
m<-1000
t1<-numeric(m)
for(i in 1:m){
  q<-rnorm(n)
  p<-exp(q)
  t1[i]<-f(p)
}
result1<-c(mean(t1),median(t1),quantile(t1,probs=seq(0.1,0.9,0.1)))
names(result1)<-c("mean","median",paste0("decile",1:9))
print(result1)
hist(t1,prob=TRUE)
t2<-numeric(m)
for(i in 1:m){
  x<-runif(n) # then x is uniform
  t2[i]<-f(x)
}
result2<-c(mean(t2),median(t2),quantile(t2,probs=seq(0.1,0.9,0.1)))
names(result2)<-c("mean","median",paste0("decile",1:9))
print(result2)
hist(t2,prob=TRUE)
t3<-numeric(m)
for(i in 1:m){
  x<-rbinom(n,1,0.1) # then x is Bernoulli(0.1)
  t3[i]<-f(x)
}
result3<-c(mean(t3),median(t3),quantile(t3,probs=seq(0.1,0.9,0.1)))
names(result3)<-c("mean","median",paste0("decile",1:9))
print(result3)
hist(t3,prob=TRUE)

question 6.10

Construct an approximate 95% confidence interval for the Gini ratio $\gamma=E[G]$ if $X$ is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

idea:

Since X is lognormal with unknown parameters, we estimate $\gamma$ by $\hat\gamma=\bar{G}$ and its spread by the sample standard deviation of the replicated Gini statistics, $\hat\sigma_G=\sqrt{\frac{1}{m-1}\sum_{i=1}^{m}(G_i-\bar{G})^2}$. The approximate 95% interval is then $\bar{G}\pm t_{0.975}(m-1)\,\hat\sigma_G$, and the coverage rate is assessed as the proportion of replicates falling inside it.

answer:

n=500
m=1000
G1<-function(n){
  y<-rnorm(n,1,1)
  x<-exp(y)
  G.sample<-f(x)
  return(G.sample)
}
G.sp<-numeric(m)
for(i in 1:m){
  G.sp[i]<-G1(n)
}
CI<-c(mean(G.sp)-sd(G.sp)*qt(0.975,m-1),mean(G.sp)+sd(G.sp)*qt(0.975,m-1))
print(CI)
cover.rate<-sum(I(G.sp>CI[1]&G.sp<CI[2]))/m
print(cover.rate)

R code

n=500
m=1000
G1<-function(n){
  y<-rnorm(n,1,1)
  x<-exp(y)
  G.sample<-f(x)
  return(G.sample)
}
G.sp<-numeric(m)
for(i in 1:m){
  G.sp[i]<-G1(n)
}
CI<-c(mean(G.sp)-sd(G.sp)*qt(0.975,m-1),mean(G.sp)+sd(G.sp)*qt(0.975,m-1))
print(CI)
cover.rate<-sum(I(G.sp>CI[1]&G.sp<CI[2]))/m
print(cover.rate)

question 6.B

Tests for association based on Pearson product moment correlation $\rho$, Spearman's rank correlation coefficient $\rho_{s}$, or Kendall's coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X,Y) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.

idea

Since the sampled distribution is bivariate normal, we construct a bivariate normal sample with means (0,0), unit variances, and correlation 0.1, and compare the empirical power of the Pearson, Kendall and Spearman tests at level 0.05.
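
The exercise also asks for an alternative bivariate distribution under which a nonparametric test beats the correlation test. A hedged sketch (the power ordering is a claim to be verified by running it, and the function name power.alt and the parameter choices are illustrative): with heavy-tailed dependent data, e.g. a bivariate t with 1 degree of freedom and correlation parameter 0.3, the rank-based tests are expected to do better because the sample moments used by Pearson's statistic are unstable under such heavy tails.

library(mvtnorm)
power.alt<-function(method,cor=0.3,n1=50,n2=1000){
  sigma<-matrix(c(1,cor,cor,1),nrow=2)
  mean(replicate(n2,expr={
    x<-rmvt(n1,sigma=sigma,df=1)          # heavy-tailed dependent sample
    cor.test(x[,1],x[,2],method=method)$p.value<=0.05
  }))
}
sapply(c("pearson","kendall","spearman"),power.alt)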

answer

library(mvtnorm)
power.test<-function(method,cor){
  n1<-500
  n2<-1e4
  mean<-c(0,0)
  sigma<-matrix(c(1,cor,cor,1),nrow=2)
  p<-mean(replicate(n2,expr={
    x<-rmvnorm(n1,mean,sigma)
    cor.test(x[,1],x[,2],method=method)$p.value<=0.05
  }))
  p
}
power<-data.frame(method=c("pearson","kendall","spearman"),power=c(power.test("pearson",0.1),power.test("kendall",0.1),power.test("spearman",0.1)))
knitr::kable(power,format="markdown",align="c")

R code

library(mvtnorm)
power.test<-function(method,cor){
  n1<-500
  n2<-1e4
  mean<-c(0,0)
  sigma<-matrix(c(1,cor,cor,1),nrow=2)
  p<-mean(replicate(n2,expr={
    x<-rmvnorm(n1,mean,sigma)
    cor.test(x[,1],x[,2],method=method)$p.value<=0.05
  }))
  p
}
power<-data.frame(method=c("pearson","kendall","spearman"),power=c(power.test("pearson",0.1),power.test("kendall",0.1),power.test("spearman",0.1)))
knitr::kable(power,format="markdown",align="c")

title: "Homework-2018.11.02" author: "By 18085" output: html_document: default


Question

Exercises 7.1, 7.5, 7.8, and 7.11

Answer

question 7.1

Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.

idea:

First, we store the data in a matrix. Writing the estimate as $\hat{\theta }=\hat{\theta }\left ( x_{1},x_{2},\ldots,x_{n} \right )$, the jackknife estimate of the bias $E\left ( \hat{\theta } \right )-\theta_{0}$ is $\left ( n-1 \right )\left (\bar{\hat{\theta }}_{\left ( \cdot \right )}-\hat{\theta } \right )$, and the jackknife estimate of $se\left ( \hat{\theta } \right )$ is $\sqrt{\frac{n-1}{n}\sum_{i=1}^{n}(\hat{\theta}_{(i)}-\bar{\hat{\theta}}_{(\cdot )})^{2}}$. We then plug the data into these formulas.

answer:

c1=c(576,635,558,578,666,580,555,661,651,605,653,575,545,572,594)
c2=c(339,330,281,303,344,307,300,343,336,313,312,274,276,288,296)
x=matrix(c(c1,c2),ncol=2)
b.cor <- function(x,i) cor(x[i,1],x[i,2])
n <- 15
theta.hat <- b.cor(x,1:n)
theta.jack <- numeric(n)
for(i in 1:n){
  theta.jack[i] <- b.cor(x,(1:n)[-i])
}
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))
round(c(original=theta.hat,bias=bias.jack, se=se.jack),3)

R code

c1=c(576,635,558,578,666,580,555,661,651,605,653,575,545,572,594)
c2=c(339,330,281,303,344,307,300,343,336,313,312,274,276,288,296)
x=matrix(c(c1,c2),ncol=2)
b.cor <- function(x,i) cor(x[i,1],x[i,2])
n <- 15
theta.hat <- b.cor(x,1:n)
theta.jack <- numeric(n)
for(i in 1:n){
  theta.jack[i] <- b.cor(x,(1:n)[-i])
}
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))
round(c(original=theta.hat,bias=bias.jack, se=se.jack),3)

question 7.5

7.5 Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $1/ \lambda$ by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.

idea:

We use boot() and boot.ci() to obtain the four 95% bootstrap confidence intervals. They differ because the bootstrap distribution of the sample mean is right-skewed for these data: the standard normal and basic intervals assume an approximately symmetric sampling distribution, while the percentile and BCa intervals adapt to the skewness (in our run the basic interval lies lowest and the BCa interval highest).
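
A hedged aside that helps explain the comparison: the point estimate of $1/\lambda$ is just the sample mean of the failure times, and the strong right-skewness of the data is what separates the normal/basic intervals from the percentile/BCa intervals:

library(boot)
h=aircondit$hours
mean(h)                          # point estimate of 1/lambda
mean((h-mean(h))^3)/sd(h)^3      # sample skewness, clearly positive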

answer:

library(boot)
aircondit
b.mean=function(x,i)mean(x[i])
mean.boot=boot(data=aircondit$hours,statistic=b.mean,R=1000)
mean.boot
CI=boot.ci(mean.boot,type=c("norm","basic","perc","bca"))
CI

R code

library(boot)
aircondit
b.mean=function(x,i)mean(x[i])
mean.boot=boot(data=aircondit$hours,statistic=b.mean,R=1000)
mean.boot
CI=boot.ci(mean.boot,type=c("norm","basic","perc","bca"))
CI

question 7.8

Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat{\theta}$.

idea:

First, we compute the MLE of the covariance matrix of scor (scaling by (n-1)/n) and its eigenvalues, which give $\hat{\theta}$, the proportion of variance explained by the first principal component. Then, leaving out one row at a time, we recompute $\hat{\theta}_{(i)}$ and plug these values into the jackknife formulas for the bias and the standard error.
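
A hedged cross-check (not in the original answer): $\hat{\theta}$ is the proportion of variance explained by the first principal component, so it can also be read off prcomp(); the (n-1)/n scaling cancels in the ratio.

library(bootstrap)
pca=prcomp(scor)
pca$sdev[1]^2/sum(pca$sdev^2)   # should match theta.hat below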

answer:

library(bootstrap)
scor
n=nrow(scor)
sigma.hat=cov(scor)*(n-1)/n
eigenvalues.hat=eigen(sigma.hat)$values
theta.hat=eigenvalues.hat[1]/sum(eigenvalues.hat)
theta.jack=numeric(n)
for(i in 1:n){
  sigma.jack=cov(scor[-i,])*(n-1)/n
  eigenvalues.jack=eigen(sigma.jack)$values
  theta.jack[i]=eigenvalues.jack[1]/sum(eigenvalues.jack)
}
bias.jack=(n-1)*(mean(theta.jack)-theta.hat)
bias.jack
se.jack=sd(theta.jack)*sqrt((n-1)^2/n)
se.jack

R code

library(bootstrap)
scor
n=nrow(scor)
sigma.hat=cov(scor)*(n-1)/n
eigenvalues.hat=eigen(sigma.hat)$values
theta.hat=eigenvalues.hat[1]/sum(eigenvalues.hat)
theta.jack=numeric(n)
for(i in 1:n){
  sigma.jack=cov(scor[-i,])*(n-1)/n
  eigenvalues.jack=eigen(sigma.jack)$values
  theta.jack[i]=eigenvalues.jack[1]/sum(eigenvalues.jack)
}
bias.jack=(n-1)*(mean(theta.jack)-theta.hat)
bias.jack
se.jack=sd(theta.jack)*sqrt((n-1)^2/n)
se.jack

question 7.11

In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.

idea:

In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Here we use leave-two-out cross validation, fitting each model on the data with every pair of observations removed and averaging the squared prediction errors. As before, Model 2, the quadratic model, gives the smallest error and is selected as the best fit for the data.
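
A small aside on cost: leave-two-out cross validation fits each model once for every pair of held-out observations, i.e. choose(n, 2) times:

library(DAAG)
choose(nrow(ironslag),2)   # number of fits per candidate model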

answer:

library(DAAG)
attach(ironslag)
n <- length(magnetic)
e1 <- e2 <- e3 <- e4 <- cbind(rep(0,n*(n-1)/2),rep(0,n*(n-1)/2))
k=1
for(i in 1:(n-1)){
  for(j in (i+1):n ){
    y <- magnetic[-c(i,j)]
    x <- chemical[-c(i,j)]
    J1 <- lm(y ~ x)
    yhat11 <- J1$coef[1] + J1$coef[2] * chemical[i]
    yhat12 <- J1$coef[1] + J1$coef[2] * chemical[j]
    e1[k,1]=yhat11-magnetic[i]
    e1[k,2]=yhat12-magnetic[j]

    J2 <- lm(y ~ x + I(x^2))
    yhat21 <- J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
    yhat22 <- J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
    e2[k,1]=yhat21-magnetic[i]
    e2[k,2]=yhat22-magnetic[j]
    J3 <- lm(log(y) ~ x)
    logyhat31 <- J3$coef[1] + J3$coef[2] * chemical[i]
    yhat31 <- exp(logyhat31)
    logyhat32 <- J3$coef[1] + J3$coef[2] * chemical[j]
    yhat32 <- exp(logyhat32)
    e3[k,1] <-  yhat31-magnetic[i]
    e3[k,2] <-  yhat32-magnetic[j]
    J4 <- lm(log(y) ~ log(x))
    logyhat41 <- J4$coef[1] + J4$coef[2] * log(chemical[i])
    logyhat42 <- J4$coef[1] + J4$coef[2] * log(chemical[j])
    yhat41 <- exp(logyhat41)
    yhat42 <- exp(logyhat42)
    e4[k,1] <-  yhat41 -magnetic[i]
    e4[k,2] <-  yhat42 -magnetic[j]
    k=k+1
  }
}

c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))

R code

library(DAAG)
attach(ironslag)
n <- length(magnetic)
e1 <- e2 <- e3 <- e4 <- cbind(rep(0,n*(n-1)/2),rep(0,n*(n-1)/2))
k=1
for(i in 1:(n-1)){
  for(j in (i+1):n ){
    y <- magnetic[-c(i,j)]
    x <- chemical[-c(i,j)]
    J1 <- lm(y ~ x)
    yhat11 <- J1$coef[1] + J1$coef[2] * chemical[i]
    yhat12 <- J1$coef[1] + J1$coef[2] * chemical[j]
    e1[k,1]=yhat11-magnetic[i]
    e1[k,2]=yhat12-magnetic[j]

    J2 <- lm(y ~ x + I(x^2))
    yhat21 <- J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
    yhat22 <- J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
    e2[k,1]=yhat21-magnetic[i]
    e2[k,2]=yhat22-magnetic[j]
    J3 <- lm(log(y) ~ x)
    logyhat31 <- J3$coef[1] + J3$coef[2] * chemical[i]
    yhat31 <- exp(logyhat31)
    logyhat32 <- J3$coef[1] + J3$coef[2] * chemical[j]
    yhat32 <- exp(logyhat32)
    e3[k,1] <-  yhat31-magnetic[i]
    e3[k,2] <-  yhat32-magnetic[j]
    J4 <- lm(log(y) ~ log(x))
    logyhat41 <- J4$coef[1] + J4$coef[2] * log(chemical[i])
    logyhat42 <- J4$coef[1] + J4$coef[2] * log(chemical[j])
    yhat41 <- exp(logyhat41)
    yhat42 <- exp(logyhat42)
    e4[k,1] <-  yhat41 -magnetic[i]
    e4[k,2] <-  yhat42 -magnetic[j]
    k=k+1
  }
}

c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))

title: "Homework-2018.11.16" author: "By 18085" output: html_document: default


Question

Exercises 8.1 Exercises 9.3 Exercises 9.6

Answer

question 8.1

Implement the two-sample Cramer-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.

idea:

First, we write a function that computes the two-sample Cramér-von Mises statistic from the two empirical cdfs. Then we apply it as a permutation test: the statistic is recomputed for many random relabelings of the pooled sample, and the p-value is the proportion of statistics (including the observed one) at least as large as the observed value.
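
For reference, the two-sample Cramér-von Mises statistic computed by the function below is
$$W_2=\frac{mn}{(m+n)^2}\left[\sum_{i=1}^{n}\left(F_n(x_i)-G_m(x_i)\right)^2+\sum_{j=1}^{m}\left(F_n(y_j)-G_m(y_j)\right)^2\right],$$
where $F_n$ and $G_m$ are the empirical cdfs of the two samples of sizes $n$ and $m$.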

answer:

f <- function(x1,x2){
  Fx1<-ecdf(x1)
  Fx2<-ecdf(x2)
  n<-length(x1)
  m<-length(x2)
  w1<-sum((Fx1(x1)-Fx2(x1))^2)+sum((Fx1(x2)-Fx2(x2))^2)
  w2<-w1*m*n/((m+n)^2)
  return(w2)
}
attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)
t=f(x,y)
m=c(x,y)
z <- numeric(1000)
m1=length(m)
m2=length(x)
q=1:m1
for (i in 1:1000) {
  d= sample(q, size = m2, replace = FALSE)
  q1 <- m[d]
  q2 <- m[-d]
  z[i] <- f(q1,q2)
}
p <- mean(abs(c(t, z)) >= abs(t))
p
hist(z, main = "", freq = FALSE, xlab = "Cramer-von Mises statistic", breaks = "scott")

points(t, 0)

R code

f <- function(x1,x2){
  Fx1<-ecdf(x1)
  Fx2<-ecdf(x2)
  n<-length(x1)
  m<-length(x2)
  w1<-sum((Fx1(x1)-Fx2(x1))^2)+sum((Fx1(x2)-Fx2(x2))^2)
  w2<-w1*m*n/((m+n)^2)
  return(w2)
}
attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)
t=f(x,y)
m=c(x,y)
z <- numeric(1000)
m1=length(m)
m2=length(x)
q=1:m1
for (i in 1:1000) {
  d= sample(q, size = m2, replace = FALSE)
  q1 <- m[d]
  q2 <- m[-d]
  z[i] <- f(q1,q2)
}
p <- mean(abs(c(t, z)) >= abs(t))
p
hist(z, main = "", freq = FALSE, xlab = "Cramer-von Mises statistic", breaks = "scott")
points(t, 0)

question 9.3

Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a $Cauchy(\theta,\eta)$ distribution has density function $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)}, \quad -\infty<x<\infty,\ \theta>0.$$ The standard Cauchy has the $Cauchy(\theta=1,\eta=0)$ density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)

idea:

First, we write the standard Cauchy density as a function. Then we run a Metropolis-Hastings sampler with a normal random-walk proposal to generate the chain, discard the first 1000 values, and compare the remaining sample with the target density graphically.

answer:

f<-function(x,u,lamada){
  t=lamada/(pi*(lamada^2+(x-u)^2))
  return(t)
}
g<-function(n,sigma,x0,u,lamada){
  x<-NULL
  x[1]<-x0
  e=runif(n)
  k<-0
  for(i in 2:n){
    y<-rnorm(1,x[i-1],sigma)
    if(e[i]<=(f(y,u,lamada)/f(x[i-1],u,lamada))){
      x[i]<-y
    }
    else
    {
      x[i]<-x[i-1]
      k<-k+1
    }
  }
  return(list(x=x,k=k))
}
z=g(n=10000,sigma=0.5,x0=0,u=0,lamada=1)
p=z$k/9000
p
hist(z$x[1001:10000],freq=F)
curve(f(x,0,1),add=TRUE)
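
The exercise also asks to compare deciles with qcauchy; a short addition using the chain generated above (an aside, not in the original answer):

probs=seq(0.1,0.9,0.1)
round(rbind(sample=quantile(z$x[1001:10000],probs),theoretical=qcauchy(probs)),3)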

R code

f<-function(x,u,lamada){
  t=lamada/(pi*(lamada^2+(x-u)^2))
  return(t)
}
g<-function(n,sigma,x0,u,lamada){
  x<-NULL
  x[1]<-x0
  e=runif(n)
  k<-0
  for(i in 2:n){
    y<-rnorm(1,x[i-1],sigma)
    if(e[i]<=(f(y,u,lamada)/f(x[i-1],u,lamada))){
      x[i]<-y
    }
    else
    {
      x[i]<-x[i-1]
      k<-k+1
    }
  }
  return(list(x=x,k=k))
}
z=g(n=10000,sigma=0.5,x0=0,u=0,lamada=1)
p=z$k/9000
p
hist(z$x[1001:10000],freq=F)
curve(f(x,0,1),add=TRUE)

question 9.6

Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are (125,18,20,34). Assume that the probabilities of the corresponding multinomial distribution are $$(\frac{1}{2}+\frac{\theta}{4}, \frac{1-\theta}{4}, \frac{1-\theta}{4},\frac{\theta}{4})$$. Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.

idea:

Given the observed counts $(x_1,x_2,x_3,x_4)=(125,18,20,34)$, the posterior of $\theta$ is proportional to the multinomial likelihood on $(0,1)$, and we sample from it with a random-walk Metropolis sampler using uniform increments.
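
Explicitly, the posterior kernel evaluated by the function below is
$$\pi(\theta\mid x)\propto\left(\tfrac12+\tfrac{\theta}{4}\right)^{125}\left(\tfrac{1-\theta}{4}\right)^{18}\left(\tfrac{1-\theta}{4}\right)^{20}\left(\tfrac{\theta}{4}\right)^{34}\propto(2+\theta)^{125}(1-\theta)^{38}\theta^{34},\qquad 0<\theta<1.$$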

answer:

f=function(t){
  if (t < 0 || t >= 1){
    y=0
  }
  else{
    y=(1/2+t/4)^125 * ((1-t)/4)^18 * ((1-t)/4)^20 * (t/4)^34
  }
  return(y)
}
t1=runif(1000)
t2=runif(1000,-0.25,0.25)
x <- numeric(1000)
x[1]=0.25
for (i in 2:1000) {
  z = x[i-1] + t2[i]
  k= f(z) / f(x[i-1])
  if (t1[i] <= k)
    x[i]=z
  else
    x[i]=x[i-1]
}
z=x[100:1000]
p=mean(z)
p

R code

f=function(t){
  if (t < 0 || t >= 1){
    y=0
  }
  else{
    y=(1/2+t/4)^125 * ((1-t)/4)^18 * ((1-t)/4)^20 * (t/4)^34
  }
  return(y)
}
t1=runif(1000)
t2=runif(1000,-0.25,0.25)
x <- numeric(1000)
x[1]=0.25
for (i in 2:1000) {
  z = x[i-1] + t2[i]
  k= f(z) / f(x[i-1])
  if (t1[i] <= k)
    x[i]=z
  else
    x[i]=x[i-1]
}
z=x[100:1000]
p=mean(z)
p

title: "Homework-2018.11.23" author: "By 18085" output: html_document: default


Question

Exercises 9.6 and 11.4

Answer

question 9.6

Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are (125,18,20,34). Assume that the probabilities of the corresponding multinomial distribution are $$(\frac{1}{2}+\frac{\theta}{4}, \frac{1-\theta}{4}, \frac{1-\theta}{4},\frac{\theta}{4})$$. Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.

answer:

Gelman.Rubin <- function(psi) {
  psi<-as.matrix(psi)
  n<-ncol(psi)
  k<-nrow(psi)
  psi.means<-rowMeans(psi)
  B<-n*var(psi.means)
  psi.w<-apply(psi,1,"var")
  W<-mean(psi.w)
  v.hat<-W*(n-1)/n+(B/n)
  r.hat<-v.hat/W
  return(r.hat)
}

prob<-function(y,ob){
  if(y < 0 || y >= 1){
    return(0)
  }else{
    return((0.5+y/4)^ob[1]*((1-y)/4)^ob[2]*
             ((1-y)/4)^ob[3]*(y/4)^ob[4])
  }
}

mult.chain<-function(ob,w,m,x1){
  x<-numeric(m)
  u<-runif(m)
  v<-runif(m,-w,w)
  x[1]<-x1
  for(i in 2:m) {
    y<-x[i-1]+v[i]
    if(u[i]<=prob(y,ob)/prob(x[i-1],ob)){
      x[i]<-y
    }else{
      x[i]<-x[i-1]}
  }
  return(x)
}

w<-.5
ob<-c(125,18,20,34)
m<-10000
burn<-1000
k<-4
x0<-c(0.5,0.9,0.1,0.75)

set.seed(12345)
X<-matrix(0,nrow=k,ncol=m)
for(i in 1:k){
  X[i, ]<-mult.chain(ob,w,m,x0[i])
}

psi<-t(apply(X,1,cumsum))
for(i in 1:nrow(psi)){
  psi[i,]<-psi[i,]/(1:ncol(psi))
}

par(mfrow=c(2,2))
for(i in 1:k)
  plot(psi[i,(burn+1):m],type="l",xlab=i,ylab=bquote(psi))

par(mfrow=c(1,1))
rhat<-rep(0,m)
for(j in (burn+1):m)
  rhat[j]<-Gelman.Rubin(psi[,1:j])
plot(rhat[(burn+1):m],type="l",xlab="length",ylab="R")
abline(h=1.2,lty=2)

R code

Gelman.Rubin <- function(psi) {
  psi<-as.matrix(psi)
  n<-ncol(psi)
  k<-nrow(psi)
  psi.means<-rowMeans(psi)
  B<-n*var(psi.means)
  psi.w<-apply(psi,1,"var")
  W<-mean(psi.w)
  v.hat<-W*(n-1)/n+(B/n)
  r.hat<-v.hat/W
  return(r.hat)
}

prob<-function(y,ob){
  if(y < 0 || y >= 1){
    return(0)
  }else{
    return((0.5+y/4)^ob[1]*((1-y)/4)^ob[2]*
             ((1-y)/4)^ob[3]*(y/4)^ob[4])
  }
}

mult.chain<-function(ob,w,m,x1){
  x<-numeric(m)
  u<-runif(m)
  v<-runif(m,-w,w)
  x[1]<-x1
  for(i in 2:m) {
    y<-x[i-1]+v[i]
    if(u[i]<=prob(y,ob)/prob(x[i-1],ob)){
      x[i]<-y
    }else{
      x[i]<-x[i-1]}
  }
  return(x)
}

w<-.5
ob<-c(125,18,20,34)
m<-10000
burn<-1000
k<-4
x0<-c(0.5,0.9,0.1,0.75)

set.seed(12345)
X<-matrix(0,nrow=k,ncol=m)
for(i in 1:k){
  X[i, ]<-mult.chain(ob,w,m,x0[i])
}

psi<-t(apply(X,1,cumsum))
for(i in 1:nrow(psi)){
  psi[i,]<-psi[i,]/(1:ncol(psi))
}

par(mfrow=c(2,2))
for(i in 1:k)
  plot(psi[i,(burn+1):m],type="l",xlab=i,ylab=bquote(psi))

par(mfrow=c(1,1))
rhat<-rep(0,m)
for(j in (burn+1):m)
  rhat[j]<-Gelman.Rubin(psi[,1:j])
plot(rhat[(burn+1):m],type="l",xlab="length",ylab="R")
abline(h=1.2,lty=2)

question 11.4

Find the intersection points $A(k)$ in $(0, \sqrt{k})$ of the curves $$S_{k-1}(a)=P \left( t(k-1)>\sqrt{\frac{a^2(k-1)}{k-a^2}} \right)$$ and $$S_{k}(a)=P \left( t(k)>\sqrt{\frac{a^2k}{k+1-a^2}} \right)$$ for $k = 4:25, 100, 500, 1000$, where $t(k)$ is a Student t random variable with k degrees of freedom. (These intersection points determine the critical values for a t-test for scale-mixture errors proposed by Szekely [260].)

idea:

We can solve $f(a)=S_{k-1}(a)-S_{k}(a)=0$, given k.

answer:

mys<-function(a,k){
  M=sqrt(a^2*k/(k+1-a^2))
  return(pt(M,df=k,lower.tail = FALSE))
}
myf<-function(a,k){
  mys(a=a,k=(k-1))-mys(a=a,k=k)
}
kc=c(4:25,100,500,1000)
A=rep(0,length(kc))
for(i in 1:length(kc)){
  A[i]=uniroot(myf,c(0.5,sqrt(kc[i]-1)),k=kc[i])$root
}
cbind(kc,A)
plot(kc,A,type="o")

R code

mys<-function(a,k){
  M=sqrt(a^2*k/(k+1-a^2))
  return(pt(M,df=k,lower.tail = FALSE))
}
myf<-function(a,k){
  mys(a=a,k=(k-1))-mys(a=a,k=k)
}
kc=c(4:25,100,500,1000)
A=rep(0,length(kc))
for(i in 1:length(kc)){
  A[i]=uniroot(myf,c(0.5,sqrt(kc[i]-1)),k=kc[i])$root
}
cbind(kc,A)
plot(kc,A,type="o")

title: "Homework-2018.11.30" author: "By 18085" output: html_document: default


Question

Exercises 11.6 Exercises 2

Answer

question 11.6

Write a function to compute the cdf of the Cauchy distribution, which has density $$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)}, \quad -\infty<x<\infty,\ \theta>0.$$ Compare your results to the results from the R function pcauchy. (Also see the source code in pcauchy.c.)

idea:

First, we write the Cauchy density and integrate it numerically to obtain the cdf (or the upper-tail probability). Then we compare the results with pcauchy.

answer:

f<-function(t,m,n){
  1/(m*pi*(1+((t-n)/m)^2))   # Cauchy density with scale m and location n
}

cauchy.cdf<-function(x,m,n,lower.tail=TRUE){
 if(lower.tail) res<-integrate(f,lower = -Inf,upper = x,rel.tol=.Machine$double.eps^0.25,m=m,n=n)
 else res<-integrate(f,lower = x,upper = Inf,rel.tol=.Machine$double.eps^0.25,m=m,n=n)
  return(res$value)
}
cauchy.cdf(x=0,m = 1,n = 0)
pcauchy(0,location = 0,scale = 1)
cauchy.cdf(x=3,m = 2,n =1,lower.tail = F )
pcauchy(3,location = 1,scale = 2,lower.tail = F)

R code

f<-function(t,m,n){
  1/(m*pi*(1+((t-n)/m)^2))   # Cauchy density with scale m and location n
}

cauchy.cdf<-function(x,m,n,lower.tail=TRUE){
 if(lower.tail) res<-integrate(f,lower = -Inf,upper = x,rel.tol=.Machine$double.eps^0.25,m=m,n=n)
 else res<-integrate(f,lower = x,upper = Inf,rel.tol=.Machine$double.eps^0.25,m=m,n=n)
  return(res$value)
}
cauchy.cdf(x=0,m = 1,n = 0)
pcauchy(0,location = 0,scale = 1)
cauchy.cdf(x=3,m = 2,n =1,lower.tail = F )
pcauchy(3,location = 1,scale = 2,lower.tail = F)

question 2

        dat <- rbind(Genotype=c('AA','BB','OO','AO','BO','AB','Sum'),
                     Frequency=c('p2','q2','r2','2pr','2qr','2pq',1),
                     Count=c('nAA','nBB','nOO','nAO','nBO','nAB','n'))
    knitr::kable(dat,format='markdown',caption = "Comparison of genotype frequencies and counts",align = "c")

idea:

First, we write down the complete-data likelihood for the ABO counts. In the E step the latent counts $n_{AA}$ and $n_{BB}$ are replaced by their conditional expectations given the current allele frequencies; in the M step the resulting expected log-likelihood is maximized numerically. We iterate until the estimates converge.
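
For reference (with the counts $n_{A\cdot}=28$, $n_{B\cdot}=24$, $n_{OO}=41$, $n_{AB}=70$ passed to the code below), the expected complete-data log-likelihood maximized in the M step is
$$Q(p,q)=2n_{AA}\log p+2n_{BB}\log q+2n_{OO}\log r+(n_{A\cdot}-n_{AA})\log(2pr)+(n_{B\cdot}-n_{BB})\log(2qr)+n_{AB}\log(2pq),$$
with $r=1-p-q$ and the E-step values $n_{AA}=n_{A\cdot}\,p^2/(p^2+2pr)$ and $n_{BB}=n_{B\cdot}\,q^2/(q^2+2qr)$ evaluated at the current estimates; the function f2 below returns $-Q$.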

answer:

library(nloptr)
f1 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {
  return(sum(x)-1)
}

f2 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {
  r1<-1-sum(x1)
  nAA<-n.A*x1[1]^2/(x1[1]^2+2*x1[1]*r1)
  nBB<-n.B*x1[2]^2/(x1[2]^2+2*x1[2]*r1)
  r<-1-sum(x)
  return(-2*nAA*log(x[1])-2*nBB*log(x[2])-2*nOO*log(r)-
           (n.A-nAA)*log(2*x[1]*r)-(n.B-nBB)*log(2*x[2]*r)-nAB*log(2*x[1]*x[2]))
}

opts <- list("algorithm"="NLOPT_LN_COBYLA",
             "xtol_rel"=1.0e-5)
mle<-c()
r<-matrix(0,1,2)
r<-rbind(r,c(0.2,0.35))
j<-2
while (sum(abs(r[j,]-r[j-1,]))>1e-5) {
res <- nloptr( x0=c(0.3,0.25),eval_f=f2,lb = c(0,0), ub = c(1,1), eval_g_ineq = f1, opts = opts,x1=r[j,],n.A=28,n.B=24,nOO=41,nAB=70 )
j<-j+1
r<-rbind(r,res$solution)
mle<-c(mle,f2(x=r[j,],x1=r[j-1,]))
}
r  
mle 

R code

library(nloptr)
f1 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {
  return(sum(x)-1)
}

f2 <- function(x,x1,n.A=28,n.B=24,nOO=41,nAB=70) {
  r1<-1-sum(x1)
  nAA<-n.A*x1[1]^2/(x1[1]^2+2*x1[1]*r1)
  nBB<-n.B*x1[2]^2/(x1[2]^2+2*x1[2]*r1)
  r<-1-sum(x)
  return(-2*nAA*log(x[1])-2*nBB*log(x[2])-2*nOO*log(r)-
           (n.A-nAA)*log(2*x[1]*r)-(n.B-nBB)*log(2*x[2]*r)-nAB*log(2*x[1]*x[2]))
}

opts <- list("algorithm"="NLOPT_LN_COBYLA",
             "xtol_rel"=1.0e-5)
mle<-c()
r<-matrix(0,1,2)
r<-rbind(r,c(0.2,0.35))
j<-2
while (sum(abs(r[j,]-r[j-1,]))>1e-5) {
res <- nloptr( x0=c(0.3,0.25),eval_f=f2,lb = c(0,0), ub = c(1,1), eval_g_ineq = f1, opts = opts,x1=r[j,],n.A=28,n.B=24,nOO=41,nAB=70 )
j<-j+1
r<-rbind(r,res$solution)
mle<-c(mle,f2(x=r[j,],x1=r[j-1,]))
}
r  
mle 

title: "Homework-2018.12.07" author: "By 18085" output: html_document: default


Question

Exercises 3, 4 and 5 on page 204; Exercise 3 on page 213; Exercise 6 on page 214

Answer

Exercises 3

Use both for loops and lapply() to fit linear models to the mtcars data using the formulas stored in this list:
formulas <- list( mpg ~ disp, 
mpg ~ I(1 / disp), 
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt 
)

idea:

We loop over the list of formulas with a for loop, fitting each model with lm() on the mtcars data, and then do the same with lapply().

answer:

attach(mtcars)
f <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )
x <- vector("list", length(f))
for (i in seq_along(f)) { x[[i]] <-lm(f[[i]]) }
x
lapply(f,lm)

R code

attach(mtcars)
f <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )
x <- vector("list", length(f))
for (i in seq_along(f)) { x[[i]] <-lm(f[[i]]) }
x
lapply(f,lm)

Exercises 4

Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function? bootstraps <- lapply(1:10, function(i) { rows <- sample(1:nrow(mtcars), rep = TRUE) mtcars[rows, ] })

answer:

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ] })
for(i in seq_along(bootstraps)){
  print(lm(mpg~disp,data =bootstraps[[i]]))
}
lapply(bootstraps,lm,formula=mpg~disp)

R code

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ] })
for(i in seq_along(bootstraps)){
  print(lm(mpg~disp,data =bootstraps[[i]]))
}
lapply(bootstraps,lm,formula=mpg~disp)

Exercises 5

For each model in the previous two exercises,extract $R^2$ using the function below. rsq <- function(mod) summary(mod)$r.squared

answer:

rsq <- function(mod) summary.lm(mod)$r.squared
for (i in seq_along(f)) {
 print( rsq(lm(f[[i]])))
  }
lapply(lapply(f,lm),rsq)
for(i in seq_along(bootstraps)){
  print(rsq(lm(mpg~disp,data =bootstraps[[i]])))
}
lapply(lapply(bootstraps,lm,f=mpg~disp),rsq)

R code

rsq <- function(mod) summary.lm(mod)$r.squared
for (i in seq_along(f)) {
 print( rsq(lm(f[[i]])))
  }
lapply(lapply(f,lm),rsq)
for(i in seq_along(bootstraps)){
  print(rsq(lm(mpg~disp,data =bootstraps[[i]])))
}
lapply(lapply(bootstraps,lm,f=mpg~disp),rsq)

Exercises 3

The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial. trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE ) Extra challenge: get rid of the anonymous function by using [[ directly.

idea:

We extract the p-value from each test object with sapply() and a small extractor function; the extra challenge (passing [[ directly) is shown below.

answer:

trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE )
p_value<-function(mod) mod$p.value
sapply(trials, p_value)
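
The extra challenge can be met by passing [[ directly to sapply(), with no helper function at all:

sapply(trials,"[[","p.value")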

R code

trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE )
p_value<-function(mod) mod$p.value
sapply(trials, p_value)

Exercises 6

Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?

answer:

x1=c(rep(1,3),rep(2,4))
x2=c(rep(3,4),rep(4,5))
x=list(x1,x2)
lapplyf=function(f,x){
  t=Map(f,x)
   n<-length(t[[1]])
   y=vapply(t,as.vector,numeric(n))
   return(y)
}
lapplyf(mean,x)
lapplyf(quantile,x)
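
A hedged variant closer to the exercise's wording (the name lmapply and its defaults are illustrative): iterate in parallel over several inputs with Map() and let vapply() enforce the shape of each result through FUN.VALUE.

lmapply=function(f,...,FUN.VALUE=numeric(1)){
  out=Map(f,...)                       # parallel iteration over all inputs
  vapply(out,identity,FUN.VALUE)       # simplify to a vector or matrix
}
lmapply(function(a,b) a+b,1:3,4:6)
lmapply(weighted.mean,list(1:5,6:10),list(rep(1,5),1:5))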


R code

x1=c(rep(1,3),rep(2,4))
x2=c(rep(3,4),rep(4,5))
x=list(x1,x2)
lapplyf=function(f,x){
  t=Map(f,x)
   n<-length(t[[1]])
   y=vapply(t,as.vector,numeric(n))
   return(y)
}
lapplyf(mean,x)
lapplyf(quantile,x)


title: "Homework-2018.12.14" author: "By 18085" output: html_document: default


Question

exercise 4 exercise 5

Answer

Exercises 4

Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or coding from the mathematical definition (http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test).

answer:

library("microbenchmark")
my.chisq.test<-function(x,y){
if(!is.vector(x) || !is.vector(y))
stop("at least one of 'x' and 'y' is not a vector")
if(typeof(x)=="character" || typeof(y)=="character")
stop("at least one of 'x' and 'y' is not a numeric vector")
if(any(x<0) || anyNA(x)) 
stop("all entries of 'x' must be nonnegative and finite")
if(any(y<0) || anyNA(y)) 
stop("all entries of 'y' must be nonnegative and finite")
if((n<-sum(x))==0) 
stop("at least one entry of 'x' must be positive")
if((n<-sum(y))==0) 
stop("at least one entry of 'y' must be positive")
if(length(x)!=length(y)) 
stop("'x' and 'y' must have the same length")
DNAME<-paste(deparse(substitute(x)),"and",deparse(substitute(y)))
METHOD<-"Pearson's Chi-squared test"
x<-rbind(x,y)
nr<-as.integer(nrow(x));nc<-as.integer(ncol(x))
sr<-rowSums(x);sc<-colSums(x);n<-sum(x)
E<-outer(sr,sc,"*")/n
STATISTIC<-sum((x - E)^2/E)
names(STATISTIC)<-"X-squared"
structure(list(statistic=STATISTIC,method=METHOD,data.name=DNAME),class="htest")
}
a<-c(365,435,527);b<-c(231,389,453)    
my.chisq.test(a,b)        
chisq.test(rbind(a,b))
microbenchmark(t1=my.chisq.test(a,b),t2=chisq.test(rbind(a,b)))  

R code

library("microbenchmark")
my.chisq.test<-function(x,y){
if(!is.vector(x) || !is.vector(y))
stop("at least one of 'x' and 'y' is not a vector")
if(typeof(x)=="character" || typeof(y)=="character")
stop("at least one of 'x' and 'y' is not a numeric vector")
if(any(x<0) || anyNA(x)) 
stop("all entries of 'x' must be nonnegative and finite")
if(any(y<0) || anyNA(y)) 
stop("all entries of 'y' must be nonnegative and finite")
if((n<-sum(x))==0) 
stop("at least one entry of 'x' must be positive")
if((n<-sum(y))==0) 
stop("at least one entry of 'y' must be positive")
if(length(x)!=length(y)) 
stop("'x' and 'y' must have the same length")
DNAME<-paste(deparse(substitute(x)),"and",deparse(substitute(y)))
METHOD<-"Pearson's Chi-squared test"
x<-rbind(x,y)
nr<-as.integer(nrow(x));nc<-as.integer(ncol(x))
sr<-rowSums(x);sc<-colSums(x);n<-sum(x)
E<-outer(sr,sc,"*")/n
STATISTIC<-sum((x - E)^2/E)
names(STATISTIC)<-"X-squared"
structure(list(statistic=STATISTIC,method=METHOD,data.name=DNAME),class="htest")
}
a<-c(365,435,527);b<-c(231,389,453)    
my.chisq.test(a,b)        
chisq.test(rbind(a,b))
microbenchmark(t1=my.chisq.test(a,b),t2=chisq.test(rbind(a,b)))  

Exercises 5

Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

answer:

my.table<-function(...,dnn = list.names(...),deparse.level = 1){
    list.names <- function(...) {
        l <- as.list(substitute(list(...)))[-1L]
        nm <- names(l)
        fixup <- if (is.null(nm)) 
            seq_along(l)
        else nm == ""
        dep <- vapply(l[fixup], function(x) switch(deparse.level + 
            1, "", if (is.symbol(x)) as.character(x) else "", 
            deparse(x, nlines = 1)[1L]), "")
        if (is.null(nm)) 
            dep
        else {
            nm[fixup] <- dep
            nm
        }
    }
    args <- list(...)
    if (!length(args)) 
        stop("nothing to tabulate")
    if (length(args) == 1L && is.list(args[[1L]])) {
        args <- args[[1L]]
        if (length(dnn) != length(args)) 
            dnn <- if (!is.null(argn <- names(args))) 
                argn
            else paste(dnn[1L], seq_along(args), sep = ".")
    }
    bin <- 0L
    lens <- NULL
    dims <- integer()
    pd <- 1L
    dn <- NULL
    for (a in args) {
        if (is.null(lens)) 
            lens <- length(a)
        else if (length(a) != lens) 
            stop("all arguments must have the same length")
        fact.a <- is.factor(a)
        if (!fact.a) {
            a0 <- a
            a <- factor(a)
        }
        ll <- levels(a)
        a <- as.integer(a)
        nl <- length(ll)
        dims <- c(dims, nl)
        dn <- c(dn, list(ll))
        bin <- bin + pd * (a - 1L)
        pd <- pd * nl
    }
    names(dn) <- dnn
    bin <- bin[!is.na(bin)]
    if (length(bin)) 
        bin <- bin + 1L
    y <- array(tabulate(bin, pd), dims, dimnames = dn)
    class(y) <- "table"
    y
}
a=b=c(1,seq(1,5))            
my.table(a,b)
table(a,b)
microbenchmark(t1=my.table(a,b),t2=table(a,b))    

R code

my.table<-function(...,dnn = list.names(...),deparse.level = 1){
    list.names <- function(...) {
        l <- as.list(substitute(list(...)))[-1L]
        nm <- names(l)
        fixup <- if (is.null(nm)) 
            seq_along(l)
        else nm == ""
        dep <- vapply(l[fixup], function(x) switch(deparse.level + 
            1, "", if (is.symbol(x)) as.character(x) else "", 
            deparse(x, nlines = 1)[1L]), "")
        if (is.null(nm)) 
            dep
        else {
            nm[fixup] <- dep
            nm
        }
    }
    args <- list(...)
    if (!length(args)) 
        stop("nothing to tabulate")
    if (length(args) == 1L && is.list(args[[1L]])) {
        args <- args[[1L]]
        if (length(dnn) != length(args)) 
            dnn <- if (!is.null(argn <- names(args))) 
                argn
            else paste(dnn[1L], seq_along(args), sep = ".")
    }
    bin <- 0L
    lens <- NULL
    dims <- integer()
    pd <- 1L
    dn <- NULL
    for (a in args) {
        if (is.null(lens)) 
            lens <- length(a)
        else if (length(a) != lens) 
            stop("all arguments must have the same length")
        fact.a <- is.factor(a)
        if (!fact.a) {
            a0 <- a
            a <- factor(a)
        }
        ll <- levels(a)
        a <- as.integer(a)
        nl <- length(ll)
        dims <- c(dims, nl)
        dn <- c(dn, list(ll))
        bin <- bin + pd * (a - 1L)
        pd <- pd * nl
    }
    names(dn) <- dnn
    bin <- bin[!is.na(bin)]
    if (length(bin)) 
        bin <- bin + 1L
    y <- array(tabulate(bin, pd), dims, dimnames = dn)
    class(y) <- "table"
    y
}
a=b=c(1,seq(1,5))            
my.table(a,b)
table(a,b)
microbenchmark(t1=my.table(a,b),t2=table(a,b)) 

