In fjv587/StatComp18082: Three functions for 'Statistical Computing' course

Function 1

MCestimate=function(n,x){
  u=runif(n)  #Generate iid Uniform(0,1) random numbers
  cdf=numeric(length(x))
  for(i in 1:length(x)){
    g=30*x[i]^3*u^2*(1-u*x[i])^2
    cdf[i]=mean(g)  #the estimated value of the cdf
  }
  phi=pbeta(x,3,3)  #the estimated values returned by the pbeta function
  print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
MCestimate(1000,c(1,2,3,4,5,6,7,8,9)/10)

Function 2

chisq_test=function(x, y) {
  n=sum(x) + sum(y)
  ni._x=sum(x)
  ni._y=sum(y)
  chisq.statistic=0
  for (i in seq_along(x)) {
    n.j=x[i] + y[i]
    expected_x=(ni._x*n.j)/n
    expected_y=(ni._y*n.j)/n
    chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
    chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
  }
  chisq.statistic
}
chisq_test(c(3,8,4,9),c(6,9,3,5))

Function 3

Jacknife.estimate=function(x,y){
  set.seed(1)
  n=length(x)
  theta.hat=cor(x,y) #the correlation estimate
  print(theta.hat)

  #compute the jackknife replicates, leave-one-out estimates
  theta.jack=numeric(n)
  for(i in 1:n){
    theta.jack[i]=cor(x[-i],y[-i])
  }
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias of the correlation statistic
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error of the correlation statistic
}
Jacknife.estimate(c(0.1,0.5,1,1.2,0.2,0.4,0.8,0.6,0.8,1.3),c(4,2.7,5.4,3.6,6.3,3.9,4.7,3.9,2.8,5.6))

Homework 1 (2018/9/14)

Question 1

creat a frame

Answer

ID=c(1,2,3,4)
age=c(25,34,28,52)
status=c("poor","improved","excelled","poor")
patientdata<-data.frame(ID,age,status)
patientdata

Question 2

Perform calculations on the matrix

Answer

m1=matrix(1,nr=2,nc=2)
m2=matrix(2,nr=2,nc=2)
m=rbind(m1,m2)  #merge matrices
m
n=cbind(m1,m2)
n

Question 3

The 2D graphics of 10 pairs of random values

Answer

x=rnorm(10)
y=rnorm(10)
plot(x,y,xlab="Ten random values",ylab="Ten other values",xlim=c(-2,2),ylim=c(-2,2),main="How to customize a plot with R")

Homework 2 (2018/9/21)

Question 1

Exercise 3.5

A discrete random variable X has probability mass function | x | 0 | 1 | 2 | 3 | 4 | |------|-----|-----|-----|-----|-----| | p(x) | 0.1 | 0.2 | 0.2 | 0.2 | 0.3 | Use the inverse transform method to generate a random sample of size 1000 from the distribution of X. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.

Answer

Idea: Use the inverse transformation method and sample function to generate random Numbers, and then compare with the theoretical probability.And in this question,I use two methods to solve the problem.

#Solution 1
x=c(0,1,2,3,4)
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
cp=cumsum(p)
n=1000
r=numeric(n)
r=x[findInterval(runif(n),cp)+1] # the 1000 random numbers generated from the distribution of X
ct=as.vector(table(r))  # take the numbers of every value of X in R and form a vector
pr=ct/sum(ct)  #the empirical probabilities
s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function
st=as.vector(table(s))  # take the numbers of every value of X in R and form a vector
psample=st/sum(st)  #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) 
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function

#Solution 2
p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities
x=rep(1,1000) # replicates the values of 1
for(i in 1:1000){
  u=runif(1)
  if(u<0.1) x[i]=0
  else{
    if(u<0.3) x[i]=1
    else{
      if(u<0.5) x[i]=2
      else{
        if(u<0.7) x[i]=3
        else x[i]=4
      }
    }
  }
}
# x :the 1000 random numbers generated from the distribution of X
ct=as.vector(table(x)) # take the numbers of every value of X in R and form a vector
pr=ct/sum(ct) #the empirical probabilities
s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function
st=as.vector(table(s))  # take the numbers of every value of X in R and form a vector
psample=st/sum(st)  #the empirical probabilities
comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) 
#the comparison of the empirical,the theoretical and the sample probabilities
comparison
a=c(ct,st)
rnames=c(0,1,2,3,4)
cnames=c("ct","st")
mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function
mymatrix
barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function

Question 2

Exercise 3.7

Write a function to generate a random sample of size $n$ from the $Beta(a,b)$ distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the $Beta(3,2)$ distribution. Graph the histogram of the sample with the theoretical $Beta(3,2)$ density superimposed.

# beta(3,2) with pdf f(x)=12x^2(1-x),0<x<1
# g(x)=x^2,0<x<1 and c=12
n=1000
k=0 #counter for accepted
j=0 #iterations
y=numeric(n)
while(k<n){
  u=runif(1)
  j=j+1
  x=runif(1) #andom variate from g
  if(x^2*(1-x)>u){
    #we accept x
    k=k+1
    y[k]=x #deliver y=x
  }
}
j
# y:the 1000 random numbers generated from the Beta(a,b) distribution
plot(y) #the scatter plot of the 1000 random numbers generated from the Beta(a,b) distribution
hist(y,prob=TRUE,main=expression(f(x)==24*x*(1-x)-12*x^2)) #density histogram of sample
t=seq(0,1,.01)
lines(t,12*t^2*(1-t)) #density curve f(x)

Question 3

Exercise 3.12

Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has $Gamma(r,\beta)$ distribution and $Y$ has $Exp(\Lambda)$ distribution. That is,$(Y |\Lambda = \lambda)$~$f_{Y}(y|\lambda) = \lambdae^{-\lambday}$. Generate 1000 random observations from this mixture with $r = 4$ and $\beta= 2$.

#generate a Poisson-Gamma mixture
n=1000
r=4
beta=2
lambda=rgamma(n, r, beta) #lambda is random
#now supply the sample of lambda???s as the exponential mean
y=rexp(n, lambda) #the 1000 random numbers generated from the mixture distribution
plot(y) # the scatter plot of the 1000 random numbers generated from the mixture distribution

Homework 3 (2018/9/28)

Question 1 (Exercise 5.4)

Write a function to compute a Monte Carlo estimate of the $Beta(3, 3)$ cdf, and use the function to estimate $F(x)$ for $x = 0.1,0.2,...,0.9$. Compare the estimates with the values returned by the $pbeta$ function in R.

Answer

Idea: First,the $Beta(3, 3)$ pdf is $$p(x)=30x^2(1-x)^2,0<x<1$$,so the $Beta(3, 3)$ cdf is $$F(x)=\int_{0}^{x}30t^2(1-t)^2dt$$. Making the substitution$y = t/x$, we have $dt = xdy$ and $\theta=\int_{0}^{1}30x^2y^2(1-xy)^2xdy$. Where the random variable Y has the $Uniform(0,1)$ distribution.Thus, $\theta=E_{Y}[30x^3Y^2(1-xY)^2]$.Generate iid $Uniform(0,1)$ random numbers $u_{1},u_{2},\cdots,u_{m}$ and compute $$\hat{\theta}=\overline{g_{m}(u)}=1/m\sum_{i=1}^{m}30x^3u_{i}^2(1-xu_{i})^2$$. The sample mean$\hat{\theta}$converges to $E[\hat{\theta}]=\theta$ as $m\to\infty$.

#write a function to compute aMonte Carlo estimate of the Beta(3,3)
my_MCestimate=function(n,x){
  u=runif(n)  #Generate iid Uniform(0,1) random numbers
  cdf=numeric(length(x)) 
  for(i in 1:length(x)){
    g=30*x[i]^3*u^2*(1-u*x[i])^2
    cdf[i]=mean(g)  #the estimated value of the cdf
  }
  phi=pbeta(x,3,3)  #the estimated values returned by the pbeta function
  print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function.
}
x=c(1,2,3,4,5,6,7,8,9)/10
n=1000
my_MCestimate(n,x) #the estimated values returned by the my_MCestimate function

Analysis:

From the results, the estimated value by monte carlo method is close to the estimated value from the pbeta function.This shows that we can use monte carlo method to estimate the distribution function directly.

Question 2 (Exercise 5.9)

The Rayleigh density [156, (18.76)] is$$f(x)=\frac{x}{\sigma^2}e^\frac{-x^2}{2\sigma^2},x\ge0,\sigma>0$$ Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution,using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$?

Answer

Idea: $f(x)=\frac{x}{\sigma^2}e^\frac{-x^2}{2\sigma^2},x\ge0,\sigma>0$,so the cdf is $$F(x)=\int_{0}^{x}\frac{t}{\sigma^2}e^-\frac{t^2}{2\sigma^2},t\geq0,\sigma>0$$.Making the substitution $u = t/x$, we have $dt = xdu$ and $$\theta=\int_{0}^{1}\frac{ux^2}{\sigma^2}e^-\frac{(xu)^2}{2\sigma^2}du$$. Where the random variable U has the $Uniform(0,1)$ distribution.Thus, $$\theta=E_{U}[\frac{Ux^2}{\sigma^2}e^-\frac{(xU)^2}{2\sigma^2}]$$. Generate random numbers $u_{1},\dots,u_{m/2}\sim uniform(0,1)$ and compute half of the replicates using $$Y_{j}=g^{(j)}(u)=\frac{u_{j}x^2}{\sigma^2}e^-\frac{(xu_{j})^2}{2\sigma^2},j=1,2,\dots,m/2$$; $$Y'{j}=\frac{(1-u{j})x^2}{\sigma^2}e^-\frac{(x(1-u_{j})^2}{2\sigma^2},j=1,2,\dots,m/2$$. Thus,$$\hat{\theta}=\overline{g_{m}(u)}=\frac{1}{m}\sum_{j=1}^{m/2}(\frac{u_{j}x^2}{\sigma^2}e^-\frac{(xu_{j})^2}{2\sigma^2}+\frac{(1-u_{j})x^2}{\sigma^2}e^-\frac{(x(1-u_{j})^2}{2\sigma^2})$$.The sample mean$\hat{\theta}$converges to $E[\hat{\theta}]=\theta$ as $m\to\infty$.

#Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution,using antithetic variables.
MC.av=function(x, m= 10000, antithetic = TRUE) {
u=runif(m/2) #Generate random numbers u(1),u(2),...,u(m/2)~Unifrom(0,1)
if (!antithetic) v=runif(m/2) else
v=1-u
u=c(u, v)
sigma=10
cdf=numeric(length(x))
for (i in 1:length(x)) {
g=u*x[i]^2/(sigma^2)* exp(-(u * x[i])^2/2*(sigma)^2)
cdf[i]=mean(g)  #the estimated value of the cdf
}
cdf
}
x=seq(1.0,5.0,length=10)
set.seed(123)
MC1=MC.av(x,anti=FALSE) ##generate samples from the distribution without using antithetic variables
set.seed(123)
MC2=MC.av(x) #generate samples from the distribution by using antithetic variables
print(round(rbind(x,MC1,MC2),10))
c(sd(MC1),sd(MC2))
print((var(MC1)-var(MC2))/var(MC1)) #the percent reduction in variance  of $\frac{X+X'}{2}$  compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$

Analysis:

From the results,we can conclude that the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$ is 95.9%.

Question 3 (Exercise 5.13)

Find two importance functions $f_{1}$ and $f_{2}$ that are supported on $(1,\infty)$ and are 'close' to $$g(x)=\frac{x^2}{\sqrt{2\pi}}e^-x^2/2,x>1$$. Which of your two importance functions should produce the smaller variance in estimating $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^\frac{-x^2}{2}dx$$ by importance sampling? Explain.

Answer

Idea: $g(x)=\frac{x^2}{\sqrt{2\pi}}e^-x^2/2,x>1$,two possible choices of important function to estimate $\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^-x^2/2dx$ by sampling method are compared.The two important functions are $$f1=\frac{1}{\sqrt{2\pi}}e^-\frac{x^2}{2}$$ and $$f2=\frac{1}{2}x^2e^-x$$.

m=10000
theta.hat=se=numeric(2)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rnorm(m) #using f1
fg1=g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
theta.hat[1]=mean(fg1) #The first estimate
se[1]=sd(fg1) #The variance of the first estimate
x=rgamma(m,3,1) #using f2
x=x[x>1] #Limit the range of x,let x>1
fg2=g(x)/((1/2)*(x^2)*exp(-x))
theta.hat[2]=mean(fg2) #The second estimate
se[2]=sd(fg2) #The variance of the second estimate
rbind(theta.hat, se)

#Draw the curves of g, f1 and f2, then compare them
curve(g,1,5,ylab="",main="The comparsion of the curves of g,f1 and f2")
f1=function(x){
  (1/sqrt(2*pi))*exp(-(x^2)/2)
}
curve(f1,1,5,add=TRUE,col="red")
f2=function(x){
  (1/2)*(x^2)*exp(-x)
}
curve(f2,1,5,add=TRUE,col="green")

#Draw the curves of fg1 and fg2, and compare them
fg1=function(x){
  g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2))
}
curve(fg1,1,5,ylab="",main="The comparsion of the curves of fg1 and fg2")
fg2=function(x){
  g(x)/((1/2)*(x^2)*exp(-x))
}
curve(fg2,1,5,add=TRUE,col="red")

Analysis:

From the results,we can conclude that the variance of $f1=\frac{1}{\sqrt{2\pi}}e^-\frac{x^2}{2}$ produced is 1.1691 and the variance of $f2=\frac{1}{2}x^2e^-x$ produced is 0.4635,so the $f2=\frac{1}{2}x^2e^-x$ produce the smaller variance in esmating the integration.From the graphs,we can conclude that the function that corresponds to the most nearly constant ratio $g(x)/f(x)$ appears to be f2.

Question 4 (Exercise 5.14)

Obtain a Monte Carlo estimate of $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^\frac{-x^2}{2}dx$$ by importance sampling.

Answer

Idea: From the question 3(exercise 5.13),we know that the $f2=\frac{1}{2}x^2e^-x$ is the more appropriate important function.So,in this question,we can use f2 to estimate tht integration.

m=10000
theta.hat=se=numeric(1)
g=function(x) {
((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1)
}
x=rgamma(m,3,1) #using f
x=x[x>1] #Limit the range of x,let x>1
fg=g(x)/((1/2)*(x^2)*exp(-x))
theta.hat[1]=mean(fg) #The second estimate
se[1]=sd(fg) #The variance of the second estimate
rbind(theta.hat, se)

Analysis:

From the results,we can conclude that the estimate is 0.4313,and I have computed the real value is 0.4006.This result shows that the estimated effect of f2 is good.

Homework 4 (2018/10/12)

Question 1 (Exercise 6.9)

Let $X$ be a non-negative random variable with $\mu=E[X]<\infty$. For a random sample $x_{1},\dots,x_{n}$ from the distribution of $X$, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum_{j=1}^{n}\sum_{i=1}^{n}|x_{i}-x_{j}|.$$

The Gini ratio is applied in economics to measure inequality in income dis- tribution (see e.g. [163]). Note that $G$ can be written in terms of the order statistics $x_{(i)}$ as $$\hat{G}=\frac{1}{n^2\mu}\sum_{i=1}^{n}(2i-n-1)x_{(i)}.$$

If the mean is unknown, let $\hat{G}$ be the statistic G with $\mu$ replaced by $\overline{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal. Repeat the procedure for the uniform distribution and $Bernoulli(0.1)$. Also construct density histograms of the replicates in each case.

Anwser

Idea:

Generate the replicates $\hat{G}^{(i)},i=1,\dots,n$ when $x$ is standard lognormal,the uniform distribution or $Bernoulli(0.1)$ by repeating:

(1)Generate $x^{(i)}{1},\dots,x^{(i)}{n}$ iid from the distribution of x.

(2)sort $x^{(i)}{1},\dots,x^{(i)}{n}$ in increaing order,to obtain $x^{(i)}{(1)}\leq\dots\leq x^{(i)}{(n)}$.

(3)compute $\hat{G}^{(i)}=\frac{1}{n^2\mu}\sum_{j=1}^{n}(2j-n-1)x^{(i)}_{(j)}$.

Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal,the uniform distribution and $Bernoulli(0.1)$. And construct density histograms of the replicates in each case.

#the number of test n=1000
#estimate the G
n=1000
G.hat=matrix(0,nrow=3,ncol=n)
mean.G.hat=numeric(3)
median.G.hat=numeric(3)
deciles.G.hat=numeric(3)
x=matrix(0,nrow=3,ncol=n)
mu.hat=matrix(0,nrow=3,ncol=n)
t=numeric(n)
for(k in 1:3){
  for(i in 1:n){ 
    # i reprent the i-th test
    x[1,1:n]=sort(rlnorm(n)) #generated the random number of x1,...,xn from the standard lognormal distribution,and sort them
    x[2,1:n]=sort(runif(n)) #generated the random number of x1,...,xn from the uniform distribution,and sort them
    x[3,1:n]=sort(rbinom(n,1,0.1)) #generated the random number of x1,...,xn from the Bernoulli(0.1),and sort them
    mu.hat[k,i]=sum(x[k,1:n])/n  #mu replaced by the mean of x
    for(j in 1:n){
      t[j]=(2*j-n-1)*x[k,j]
    }
    G.hat[k,i]=sum(t[1:n])/(mu.hat[k,i]*(n^2)) #the estimate of G from the k-th distribution
  }
}

#Estimate the mean,median and deciles of G.hat if X is standard lognorm,uniform distribution and benoulli(0.1)
mean.G.hat=apply(G.hat,1,mean) #the mean of G.hat
median.G.hat=apply(G.hat,1,median) #the median of G.hat
deciles.G.hat[1]=quantile(G.hat[1,1:n],0.1) #the deciles of G.hat
deciles.G.hat[2]=quantile(G.hat[2,1:n],0.1)
deciles.G.hat[3]=quantile(G.hat[3,1:n],0.1) 
round(rbind(mean.G.hat,median.G.hat,deciles.G.hat),3) #output the mean,median and deciles of G.hat when x is standard lognormal,the uniform distribution and Bernoulli(0.1)

#construct the density histograms of the replicates
hist(G.hat[1,1:n],main="density histogram of G.hat when x is standard lognormal")
hist(G.hat[2,1:n],main="density histogram of G.hat when x is uniform distribution")
hist(G.hat[3,1:n],main="density histogram of G.hat when x is Bernoulli(0.1)")

Analysis:

From the results,we can know that the mean,median and deciles of G.hat is approximately equal under the same distribution of x.But when x is different distribution,the value is different too.

Question 2 (Exercise 6.10)

Construct an approximate 95% confidence interval for the Gini ratio $\gamma=E(G)$ if $X$ is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

Answer

Idea:

(1)Firstly,assume the $\gamma \sim norm(E(\gamma),var(\gamma))$,use the method of Exercise 6.9 to estimate $E(\hat{G})$ and $Var(\hat{G})$,and let $E(\hat{G})$ and $Var(\hat{G})$ replace $E(\gamma),var(\gamma)$.

(2)construct a function to return the 95% confidence interval of gamma

(3)Then,repeat the estimation of confidence interval procedure 100 times,compute the times $m$ of the true $\gamma$ is in the confidence interval

(4)Lastly,the coverage rate $CP=m/100$

#construct a function to return the 95% confidence interval of gamma
CI=function(n,a,b,alpha=0.05){
  t=numeric(n)
  g=numeric(n)
  for(i in 1:n){
    x=sort(rlnorm(n,a,b))
    mu.hat=mean(x)
    for(j in 1:n){
      t[j]=(2*j-n-1)*x[j]
    }
    g[i]=mean(t)/(n*mu.hat)
  }
  gamma.hat=mean(g)
  sd.hat=sqrt(sum((g-mean(g))^2))/n #estimate the standard deviation of gamma.hat
  c=qnorm(1-alpha/2,0,1)*sd.hat
  CI1=gamma.hat-c #the confidence lower limit of gamma
  CI2=gamma.hat+c #the confidence upper limit of gamma
  c(CI1,CI2)
}
CI(1000,1,1,0.05) #construct an approximate 95% confidence interval of gamma when the parameters are given

#assess the coverage rate
d=matrix(0,nrow=100,ncol=2)
for(k in 1:100){
  d[k,1:2]=CI(1000,1,1,0.05)
}
CI1=d[1:100,1]
CI2=d[1:100,2]
#simulate the true value of gamma
gamma=2*pnorm(1/sqrt(2),0,1)-1  #I get this value in an established paper
gamma
m=0
for(i in 1:100){
  if(gamma<CI1[i]) m=m
  else{
    if(gamma>CI2[i]) m=m
    else{
      m=m+1
    }
  }
}
m
CP=m/100
CP #the coverage rate

n=1000
mu=0
sigma=0.9
repsig=400
repmean=100
system.time({
  G=function(x,mu=mean(x),n=length(x))sort(x)%*%(1:n*2-n-1)/n/n/mu
  mG=function(sig){mu=mean(rlnorm(repmean*n,0,sig));mean(sapply(1:repmean,function(...){G(rlnorm(n,0,sig),mu)}))}
  X=rlnorm(n,mu,sigma)
  print((CI_sig=sqrt(var(log(X))/qchisq(c(0.9875,0.0125),df=n-1)*(n-1))))
  rsig=sqrt(var(log(X))/qchisq(runif(repsig,0.0125,0.9875),df=n-1)*(n-1))
  Gs=sapply(rsig,mG)
  print(quantile(Gs,c(0.0125,0.9875)))
})

Analysis:

From the results,we can see that the coverage rate CP is 0.43 when conduct 100 times experiment.The CP is visibly smaller than the nominal level,then it is said to be liberal.

Question 3(Projects 6.B)

Tests for association based on Pearson product moment correlation $\rho$, Spearman???s rank correlation coefficient $\rho_{s}$, or Kendall???s coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_{s}$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X,Y ) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.

Answer

library(MASS)
Sigma=matrix(c(1,2,1,9),2,2)  
A=mvrnorm(n=1000, rep(0, 2), Sigma) #the sampled distribution is bivariate normal
plot(A[1:1000,1],A[1:1000,2])
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="pearson",conf.level=0.95)
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="kendall",conf.level=0.95)
cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="spearman",conf.level=0.95)

#compute the power
power=matrix(0,nrow=20,ncol=3)
for(i in 1:20) {
pvalues <- replicate(100, expr = {
  Sigma=matrix(c(1,2,1,9),2,2)
  A=mvrnorm(n=100, rep(0, 2), Sigma)
  ctest=cor.test(A[1:100,1],A[1:100,2],alternative="two.sided",method=c("pearson","kendall","spearman"),conf.level=0.95)
  ctest$p.value 
  } )
power[i,1:3]=mean(pvalues<=0.05)
}

Homework 5 (2018/11/2)

Question 1(Exercise 7.1)

Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.

Answer

Idea:

Compute a jackknife estimate of the bias and the standard error of the corre- lation statistic in Example 7.2 as follows:

(1)Let $x=(x_{1},\dots,x_{n})$ be an observed random sample, and idefine the $i$th jackknife sample $x_{(i)}$ to be the subset of $x$ that leaves out the $i^{th}$ observation $x_{i}$. That is,$$x_{(i)}=(x_{1},\dots,x_{i},x_{i+1},\dots,x_{n})$$.

(2)$\hat{\theta}=cor(LAST,GPA)$,define the $i^{th}$ jackknife replicate $\hat{\theta}=cor(LAST[-i],GPA[-i])$.

(3)The jackknife estimate of bias is $$\hat{bias}{jack}=(n-1)(\overline{\hat{\theta{(.)}}}-\hat{\theta})$$. where $\hat{\theta_{(.)}}=\frac{1}{n}\sum_{i=1}^{n}\hat{\theta}_{i}$.

(4)A jackknife estimate of standard error is $$\hat{se}{jack}=\sqrt{\frac{n-1}{n}\sum{i=1}^{n}(\hat{\theta}{(i)}-\overline{\hat{\theta{(.)}}})^2}$$.

set.seed(1)
library(bootstrap) #for the law data
LSAT=law$LSAT
GPA=law$GPA
n=nrow(law)
system.time({
  theta.hat=cor(LSAT,GPA) #the correlation estimate
  print(theta.hat)

  #compute the jackknife replicates, leave-one-out estimates
  theta.jack=numeric(n)
  for(i in 1:n){
    theta.jack[i]=cor(LSAT[-i],GPA[-i])
  }
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias of the correlation statistic
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error of the correlation statistic
})

Analysis

The jackknife estimate of the bias and the standard error of the correlation statistic are -0.006473623 and 0.1425186.

Question 2(Exercise 7.5)

Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $1/\lambda$ by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.

Answer

set.seed(1)
library(boot)   #for boot and boot.ci
n=100 #the times of bootstrap replicates
system.time({
  x=c(3,5,7,18,43,85,91,98,100,130,230,487)
  boot.mean=function(x,i) mean(x[i])
  mean.hat=mean(x)
  boot.obj=boot(x,statistic = boot.mean,R=n)
  print(boot.ci(boot.obj,type=c("norm","basic","perc","bca")))

})

Analysis

From the results,we can see that the 95% bootstrap confidence intervals for the mean time between failures by the standard normal, basic,percentile, and BCa methods respectively are $( 30.6, 187.0 ), ( 4.4, 172.2 ), ( 43.9, 211.7 ), and ( 53.9, 226.3 ).$

Because the BCa confidence intervals are transformation respecting and BCa intervals have second order accuracy, the bootstrap t confidence interval is second order accurate but not transformation respecting, the bootstrap percentile interval is transformation respecting but only first order accurate, the standard normal confidence interval is neither transformation respecting nor second order accurate, so the intervals may differ.Hence, the larger the order of accuracy, the faster the error is reduced.So the Bca method is more accurate.

Question 3 (Exercise 7.8)

Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat{\theta}$.

Answer

Idea:

(1)the MLE of covariance is $$\hat{\Sigma}=\frac{1}{n-1}(X-\mu)(X-\mu)^T,$$ where the $X=scor^T=(x_{1},\dots,x_{n})$ is the sample matrix, ,$x_{i}=(x_{i1},\dots,x_{i5})^T$,$\mu=(\mu_{1},\dots,\mu_{n}),n=1,\dots,88$, where $\mu_{i}$ is the rowmeans of $scor^T$.

(2)obtain the jackknife estimates,first and idefine the $i$th jackknife sample $x_{(i)}$ to be the subset of $x$ that leaves out the $i^{th}$ observation $x_{i}$. That is,$$Y_{i}=(x_{1},\dots,x_{-1},x_{i+1},\dots,x_{n})$$.

(3)$\hat{\Sigma}=\frac{1}{n-1}(X-\mu)(X-\mu)^T$,define the $i^{th}$ jackknife replicate $\hat{\Sigma}=\frac{1}{n-1}(Y-\mu)(Y-\mu)^T$.

(4)Then,compute the eigenvalues of Sigma.hat under $\hat{\Sigma}$,$\hat{\lambda}{1}>\dots>\hat{\lambda}{5}$,then $\hat{\theta}=\frac{\hat{\lambda}{1}}{\sum{j=1}^{j=5}\hat{\lambda}_{j}}$.

(5)Then,compute the eigenvalues of Sigma.jack under $\hat{\Sigma}{jack}$,then compute the $\hat{\theta}{jack}$.

(6)The jackknife estimate of bias is $$\hat{bias}{jack}=(n-1)(\overline{\hat{\theta{(.)}}}-\hat{\theta})$$. where $\hat{\theta_{(.)}}=\frac{1}{n}\sum_{i=1}^{n}\hat{\theta}_{jack(i)}$.

(7)A jackknife estimate of standard error is $$\hat{se}{jack}=\sqrt{\frac{n-1}{n}\sum{i=1}^{n}(\hat{\theta}{(i)}-\overline{\hat{\theta{(.)}}})^2}$$.

set.seed(1)
data(scor, package = "bootstrap") #for the score data
library(MASS)
scor.m= matrix(as.numeric(t(scor)), 5, 88)
n=nrow(scor)  #sample size
mu=rowMeans(scor.m)
mu0=matrix(mu,nrow=5,ncol=n)
system.time({
  sigma.hat=((scor.m-mu0)%*%t(scor.m-mu0))/(n-1) # the MLE of covariance matrix
  ev.hat=sort(eigen(sigma.hat)$values,decreasing=T)
  theta.hat=ev.hat[1]/sum(ev.hat) 
  theta.hat #the estimate of theta

  #obtain the jackknife estimates of bias and standard error of theta.hat
  theta.jack=numeric(n)
  for(i in 1:n){
    scor.m.jack= scor.m[,-i]  #compute the jackknife replicates, leave-one-out estimates
    mu1=matrix(mu,nrow=5,ncol=n-1)
    sigma.jack=((scor.m.jack-mu1)%*%t(scor.m.jack-mu1))/(n-1)
    ev.jack=sort(eigen(sigma.jack)$values,decreasing=T)
    theta.jack[i]=ev.jack[1]/sum(ev.jack) 
  }
  theta.jack
  bias=(n-1)*(mean(theta.jack)-theta.hat)
  print(bias)  #a jackknife estimate of the bias 
  se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2))
  print(se)  #a jackknife estimate of the standard error 

})

Analysis

The jackknife estimates of bias and standard error of $\hat{\theta}$ are 0.001044413 and 0.04897464.

Qusetion 4 (Exercise 7.11)

In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.

Answer

Idea:

(1)Leave-two-out cross-validation

Leave-two-out cross-validation involves using 2 observations as the validation set and the remaining $n-2$ observations as the training set. This is repeated on all ways to cut the original sample on a validation set of p observations and a training set. LtO cross-validation requires training and validating the model $m=c^{n}{2}$ times, where n is the number of observations in the original sample, and where $m=c^{n}{2}$ is the binomial coefficient.

(2)Let $y=magnetic,x=chemical$, For $k = 1,\dots,n$, let observation $(x_{i},y_{i}),(x_{j},y_{j})$ be the test point and use the remaining observations to fit the model.

(a) Fit the model(s) using only the $n ??? 2$ observations in the training set

(b) Compute the predicted response $$\hat{y}{i}=\hat{\beta}{0}+\hat{\beta}{1}x_{i}$$, $$\hat{y}{j}=\hat{\beta}{0}+\hat{\beta}_{1}x{j}$$ for the test point.

(3)Estimate the mean of the squared prediction errors $\hat{\sigma}^2=\overline{spe}$

set.seed(1)
library(DAAG)
attach(ironslag)
n=length(magnetic) #in DAAG ironslag
m=n*(n-1)/2  # leave-two-out cross validation require training and validating the model m times
spe1=spe2=spe3=spe4=numeric(m)
# for leave-two-out cross validation
# fit models on leave-two-out samples
k=0
for(i in 1:(n-1)){
  for(j in (i+1):n){
    k=k+1
    y=magnetic[-c(i,j)] #leave-two-out 
    x=chemical[-c(i,j)]

    J1=lm(y ~ x)
    yhat11=J1$coef[1] + J1$coef[2] * chemical[i]   #the  estimate value of the i-th value
    yhat12=J1$coef[1] + J1$coef[2] * chemical[j]    #the  estimate value of the j-th value
    spe1[k]=((magnetic[i]-yhat11)^2+(magnetic[j]-yhat12)^2)/2   #the squared prediction error of the k-th 

    J2=lm(y ~ x + I(x^2))
    yhat21=J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
    yhat22=J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
    spe2[k]=((magnetic[i]-yhat21)^2+(magnetic[j]-yhat22)^2)/2

    J3=lm(log(y) ~ x)
    logyhat31 =J3$coef[1] + J3$coef[2] * chemical[i]
    logyhat32 =J3$coef[1] + J3$coef[2] * chemical[j]
    yhat31=exp(logyhat31)
    yhat32=exp(logyhat32)
    spe3[k]=((magnetic[i]-yhat31)^2+(magnetic[j]-yhat32)^2)/2

    J4=lm(log(y) ~ log(x))
    logyhat41 =J4$coef[1] + J4$coef[2] * log(chemical[i])
    logyhat42 =J4$coef[1] + J4$coef[2] * log(chemical[j])
    yhat41=exp(logyhat41)
    yhat42=exp(logyhat42)
    spe4[k]=((magnetic[i]-yhat41)^2+(magnetic[j]-yhat42)^2)/2

  }
}
c(mean(spe1),mean(spe2),mean(spe3),mean(spe4))


#According to the prediction error criterion, Model 2, the quadratic model, would be the best fit for the data.
J2
par(mfrow = c(1, 2)) #layout for graphs
plot(J2$fit, J2$res) #residuals vs fitted values
abline(0, 0) #reference line
qqnorm(J2$res) #normal probability plot
qqline(J2$res) #reference line
par(mfrow = c(1, 1)) #restore display

Analysis

According to the prediction error criterion, Model 2, the quadratic model,would be the best fit for the data.Its results are consistent with those of the leave-one out method.From the QQ plot, we can see the Model 2 is similar to the sample distribution,

Homework 6 (2018/11/16)

Question 1 (Exercise 8.1)

Implement the two-sample Cram´ er-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.

set.seed(1)
library(nortest)
chickwts
x=sort(as.vector(chickwts$weight[chickwts$feed == "soybean"]))
y=sort(as.vector(chickwts$weight[chickwts$feed == "linseed"]))
n=length(x)
m=length(y)


R=999 #number of replicates
z=c(x, y) #pooled sample
K=1:26
D=numeric(R) #storage for replicates
fnx=ecdf(x)  #the ecdf of x
fny=ecdf(y)   #the ecdf of y
options(warn = -1)
D0=(sum((fnx(x)-fny(x))^2)+sum((fnx(y)-fny(y))^2))/((m*n)/((m+n)^2))  #statistic
for (i in 1:R){
  k=sample(K,size=n,replace=FALSE) #permutation
  x1=z[k]
  y1=z[-k]
  D[i]=(sum((fnx(x1)-fny(x1))^2)+sum((fnx(y1)-fny(y1))^2))/((m*n)/((m+n)^2))  #statistic
}

options(warn = 0)
p=mean(D>D0)
p

Analysis

From the results,p=0.2712 is significent,so we should reject the $H_{0}$.

Qusetion 2

Design experiments for evaluating the performance of the NN,energy, and ball methods in various situations.

(1)Unequal variances and equal expectations

(2)Unequal variances and unequal expectations

(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)

(4)Unbalanced samples (say, 1 case versus 10 controls)

Note: The parameters should be choosen such that the powers are distinguishable (say, range from 0.3 to 0.9).

set.seed(1)
library(RANN)
library(energy)
library(Ball)
library(boot)


#(1) Unequal variances and equal expectations

m=40 #number of replicates
R=999
k=3



Tn= function(z, ix, sizes,k) {
  n1=sizes[1]
  n2 = sizes[2]
  n =n1 + n2
  if(is.vector(z)) z=data.frame(z,0);
  z =z[ix, ];
  NN =nn2(data=z, k=k+1) 
  block1= NN$nn.idx[1:n1,-1]
  block2=NN$nn.idx[(n1+1):n,-1]
  i1 =sum(block1 < n1 + .5); 
  i2=sum(block2 > n1+.5)
  (i1 + i2) / (k * n)
}



eqdist.nn=function(z,sizes,k){
boot.obj=boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p1.values=matrix(NA,m,3)
for(i in 1:m){
  x=rnorm(10,4,4) #generate a sample x~N(4,4)
  y=rnorm(10,4,1) #generate a sample y~n(4,1)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p1.values[i,1]=eqdist.nn(z,N,k)$p.value
  p1.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p1.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
eqdist.nn(z,N,k)$p.value
alpha=0.1
pow1=colMeans(p1.values<alpha)
pow1
names(pow1)=c("NN", "energy", "ball")
barplot(pow1)




#(2) Unequal variances and unequal expectations

m=40  #number of replicates
R=999
k=3
z=c(x,y) # pooled sample
eqdist.nn=function(z,sizes,k){
boot.obj = boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p2.values=matrix(NA,m,3)
for(i in 1:m){
  x=rnorm(10,4,4) #generate a sample x~N(4,4)
  y=rnorm(10,2,1) #generate a sample y~N(2,1)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p2.values[i,1]=eqdist.nn(z,N,k)$p.value
  p2.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p2.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow2=colMeans(p2.values<alpha)
pow2
names(pow2)=c("NN", "energy", "ball")
barplot(pow2)



#(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)

m=40 #number of replicates
R=999
k=3
z=c(x,y) #poled sample
eqdist.nn=function(z,sizes,k){
boot.obj =boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p3.values=matrix(NA,m,3)
for(i in 1:m){
  x=rt(10,1)  #generate a sample x~t(1)
  y=c(rnorm(5,2,1),rnorm(5,4,4)) #generate a bimodel  sample y which is a mixture of N(2,1) and N(4,4)
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p3.values[i,1]=eqdist.nn(z,N,k)$p.value
  p3.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p3.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow3=colMeans(p3.values<alpha)
pow3
names(pow3)=c("NN", "energy", "ball")
barplot(pow3)




#(4)Unbalanced samples (say, 1 case versus 10 controls)

m=40  #number of replicates
R=999
k=3
z=c(x,y) #pooled sample
eqdist.nn =function(z,sizes,k){
boot.obj= boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts=c(boot.obj$t0,boot.obj$t)
p.value=mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p4.values=matrix(NA,m,3)
for(i in 1:m){
  x=rexp(10,4)  #generate a sample x~exp(4)
  y=rf(15,1,2)  #generate a sample y~F(1,2),and the numbers of them are different
  z=c(x,y)
  n1=length(x)
  n2=length(y)
  N=c(n1,n2)
  p4.values[i,1]=eqdist.nn(z,N,k)$p.value
  p4.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value
  p4.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha=0.1
pow4=colMeans(p4.values<alpha)
pow4
names(pow4)=c("NN", "energy", "ball")
barplot(pow4)

Homework 7 (2018/11/23)

Question 1

For exercise 9.6, use the Gelman-Rubin method to monitor convergence of the chain, and run the chain until the chain has converged approximately to the target distribution according to $\hat{R}<1.2$.

set.seed(1)
Gelman.Rubin=function(psi) {
# psi[i,j] is the statistic psi(X[i,1:j])
# for chain in i-th row of X
psi =as.matrix(psi)
n=ncol(psi)
k=nrow(psi)
psi.means=rowMeans(psi) #row means
B=n * var(psi.means) #between variance est.
psi.w=apply(psi, 1, "var") #within variances
W=mean(psi.w) #within est.
v.hat=W*(n-1)/n + (B/n) #upper variance est.
r.hat=v.hat / W #G-R statistic
return(r.hat)
}


  f=function(theta,x){
    f1=(0.5+theta/4)^x[1]
    f2=((1-theta)/4)^x[2]
    f3=((1-theta)/4)^x[3]
    f4=(theta/4)^x[4]
    if(theta>=0 && theta<1)
      return(f1*f2*f3*f4)
    else{
      return(0)
      }
  }


unif.chain=function(m,p1){
  #generates a Metropolis chain for unif(0,1)
  #with unif(-1,1) proposal distribution
  #and starting value X1
  p=rep(0, m)
  p[1]=p1
  u=runif(m)
  y=runif(m,-1,1) #the proposal distribution
  for (i in 2:m) {
    theta=p[i-1]+y[i]
    num= f(theta,x)
    den=f(p[i-1],x)
    if(u[i]<=num/den)
      p[i]=theta
    else{
      p[i]=p[i-1]
    }
    }
return(p)
}


m=5000 #the number of replicates
b=4700 #burn-in length
t=4 #number of chains to generate
x=c(125,18,20,34)
p0=c(0.2,0.4,0.6,0.8) #choose overdispersed initial values


#generate the chains
P=matrix(0,nrow=t,ncol=m)
for(i in 1:t)
  P[i,]=unif.chain(m,p0[i])

#compute diagnostic statistics
psi=t(apply(P, 1, cumsum))
for (i in 1:nrow(psi))
psi[i,]=psi[i,] / (1:ncol(psi))
print(Gelman.Rubin(psi))

Analysis

From the results, we can see that the $\hat{r}$ is $1.0595<1.2$. It shows that the chains have approximately converged to the target distribution.

Question 2(Exercise 11.4)

Find the intersection points $A(k)$ in $(0,\sqrt{k})$ of the curves $$S_{k-1}(a)=P\left(t(k-1)>\sqrt{\frac{a^{2}(k-1)}{k-a^{2}}}\right)$$ and $$S_{k}(a)=P\left(t(k)>\sqrt{\frac{a^{2}k}{k+1-a^{2}}}\right)$$,for $k=4:25,100,500,1000$, where $t(k)$ is a Student $t$ random variable with $k$ degrees of freedom. (These intersection points determine the critical values for a t-test for scale-mixture errors proposed by Sz´ekely [260].)

set.seed(1)
library(rootSolve)
k=c(4:25,100,500,1000)  #the degrees
n=length(k)
solution=numeric(n)
s=numeric(n)
for(i in 1:n){
  s1=function(a){
    q=sqrt((a^2)*(k[i]-1)/(k[i]-a^2))
    pt(q,df=k[i]-1,lower.tail = FALSE,log.p = FALSE)
  }
  s2=function(a){
    q=sqrt((a^2)*k[i]/(k[i]+1-a^2))
    pt(q,df=k[i],lower.tail = FALSE,log.p = FALSE)
  }
  f=function(a){
    s1(a)-s2(a)
  }
  solution[i]=uniroot(f,c(0.0001,sqrt(k[i])-0.0001))$root  #the abscissa of the intersection points
  s[i]=s1(solution[i])  #the ordinate of the intersection points
  }
print(round(rbind(solution,s),5))

Analysis

From the results ,we can see the abscissa of the intersection points is in 1~2 when $k<500$.When $k=500,1000$,there are infinite intersections.

Homework 8 (2018/11/30)

Question 1 (Exercise 11.6)

Write a function to compute the cdf of the Cauchy distribution, which has density $$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},-\infty0$. Compare your results to the results from the R function pcauchy. (Also see the source code in pcauchy.c.)

set.seed(1)
f=function(x,theta,eta){
  1/(theta*pi*(1+((x-eta)/theta)^2))
}
res=integrate(f,lower=-Inf,upper=6,rel.tol=.Machine$double.eps^0.25,theta=1,eta=1)
print(round(rbind(res$value,pcauchy(6,lower.tail = TRUE)),5))

Analysis

From the result,we can see the value from this method is 0.93717,the value from the R function pcauchy is 0.94743, they are very colsed.

Question 2

A-B-O blood type problem

Let the three alleles be A, B and O.

| Genotype | AA | BB | OO | AO | BO | AB | AA | | -------- | -----: | :----: | -----: | :----: | -----: | :----: | -----: | | Frenquency | $p^{2}$ | $q^{2}$ | $r^{2}$ | 2pr | 2qr | 2pq | 1 | | Count | nAA | nBB | nOO | nAO | nBO | nAB | nAA |

Observed data: $$n_{A.}=n_{AA}+n_{AO}=28(A-type)$$, $$n_{B.}=n_{BB}+n_{BO}=24(B-type)$$, $$n_{OO}=41(O-type)$$, $$n_{AB}=70(AB-type)$$.

Use EM algorithm to solve MLE of $p$ and $q$ (consider missing data $n_{AA}$ and $n_{BB}$).

Record the maximum likelihood values in M-steps,are they increasing?

ABOexpect=function(nA,nB,p,q,r) {
    nAA=nA * p / (p + 2*r)
    nAO=nA - nAA
    nBB=nB * q / (q + 2*r)
    nBO=nB - nBB
    N=list(AA=nAA,AO=nAO,BB=nBB,BO=nBO)
  }

  ABOmaximize=function(nAA,nAO,nBB,nBO,nAB,nO) {
    p=(2 * nAA + nAO + nAB) / (2 * (nA + nB + nO + nAB))
    q=(2 * nBB + nBO + nAB) / (2 * (nA + nB + nO + nAB))
    r= 1.0 - p - q
    L=list(p=p,q=q,r=r)
  }

  #initial guess
  p=0.3
  q=0.2
  r=0.5
  ## Observed data (counts of people with the four blood groups)
  nA =28
  nB=24
  nAB=41
  nO=40

  ## Set up iteration
  iter=1
  diff=1
  tol=0.0001 ## tolerance

  while (diff > tol) {
    E=ABOexpect(nA,nB,p,q,r)
    M=ABOmaximize(E$AA,E$AO,E$BB,E$BO,nAB,nO)
    diff=abs(M$p - p) + abs(M$q - q) + abs(M$r -r)
    p=M$p
    q=M$q
    r=M$r
    cat(sprintf("iter:%d, diff:%.2f\n",iter,diff))
    iter=iter + 1
  }

  cat(sprintf("p=%.4f,q=%.4f, r=%.4f\n",p,q,r))

Analysis

From the result, we can see the MLE of the $p,q$ are respectivly 0.2847, 0.2649.

Homework 9 (2018/12/7)

Question 1 (Page 204 Exercise 3)

Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:

formulas=list(

mpg~disp,

mpg~I(1/disp),

mpg~disp+wt,

mpg~I(1/disp)+wt

)

set.seed(1)
library(base)
 formulas=list(
      mpg ~ disp,
      mpg ~ I(1 / disp),
      mpg ~ disp + wt,
      mpg ~ I(1 / disp) + wt
    )


#for loops
out_formulas=vector('list', length(formulas))
for(i in seq_along(formulas)) {
  out_formulas[[i]]=lm(formulas[[i]], data = mtcars)
  }
out_formulas


#lapply
lapply(formulas, lm, mtcars)

Question 2 (Exercise 4)

Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply() . Can you do it without an anonymous function?

bootstraps=lapply(1:10, function(i) {

rows=sample(1:nrow(mtcars), rep = TRUE)

mtcars[rows, ]

})

set.seed(1)
bootstraps=lapply(1:10, function(i) {
  rows=sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
}
  )


#for loops
out_bootstraps=vector('list', length(bootstraps))
for(i in seq_along(bootstraps)) {
  out_bootstraps[[i]]=lm(mpg~disp, data = bootstraps[[i]])
  }
out_bootstraps


#lapply
lapply(bootstraps, lm, formula = mpg~disp)

Question 3 (Exercise 5)

For each model in the previous two exercises, extract $R^{2}$ using the function below.

rsq=function(mod) summary(mod)$r.squared

set.seed(1)
rsq=function(mod) summary(mod)$r.squared

#Exercise 3
lapply(out_formulas, rsq)

#Exercise 4
lapply(out_bootstraps, rsq)

Question 4 (Page 214 Exercise 3)

The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.

trials=replicate(100,t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE)

Extra challenge: get rid of the anonymous function by using[[ directly.

set.seed(1)
p=numeric(100)
trials= replicate(100,t.test(rpois(10, 10), rpois(7, 10)),simplify = FALSE)

#for loops
for(i in 1:100){
  p[i]=trials[[i]]$p.value
}
p


#sapply
x=c(1:100)
p1=sapply(trials,function(mod) mod$p.value)
p1

#anonymous function
p2=sapply(trials,'[[','p.value')
p2


#compare the consequence of the for loops and sapply
identical(p,p1) 
identical(p,p2)

Question 5 (Page 214 Exercise 6)

Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?

set.seed(1)
library(parallel)
library(foreach)

#parallel
mcvMap=function(f, FUN.VALUE , ...) {
    out=Map(f, ...)
    vapply(out, FUN, FUN.VALUE)
}


#foreach
y=foreach(x=1:1000,.combine='rbind') %do% exp(x)

Homework 10 (2018/12/14)

Question 1 (17.5.1 Exercise 4)

Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or by coding from the mathematical definition ( http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test ).

set.seed(1)
options(warn = -1)
chisq_test2=function(x, y) {
    n=sum(x) + sum(y)
    ni._x=sum(x)
    ni._y=sum(y)
    chisq.statistic=0
    for (i in seq_along(x)) {
      n.j=x[i] + y[i]
      expected_x=(ni._x*n.j)/n
      expected_y=(ni._y*n.j)/n
      chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x
      chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y
    }
    chisq.statistic
  }

x=c(3,8,4,9)
y=c(6,9,3,5)
a=matrix(c(x,y),4,2)
print(chisq_test2(x,y)) 
print(chisq.test(a)$statistic) 



#compare the computation time of the two functions

microbenchmark::microbenchmark(chisq_test2(x,y),chisq.test(a))

Analysis

From the results,we can see the function chisq_test2 is mucher faster than the function chisq.test.

Question 2 (17.5.1 Exercise 5)

Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

set.seed(1)
x=c(3,8,4,9)
y=c(6,9,3,5)
b=table(x,y)
summary(b)

fjv587/StatComp18082 documentation built on May 29, 2019, 8:34 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

fjv587/StatComp18082 Three functions for 'Statistical Computing' course

In fjv587/StatComp18082: Three functions for 'Statistical Computing' course

Function 1

Function 2

Function 3

Homework 1 (2018/9/14)

Question 1

Answer

Question 2

Answer

Question 3

Answer

Homework 2 (2018/9/21)

Question 1

Exercise 3.5

Answer

Question 2

Exercise 3.7

Question 3

Exercise 3.12

Homework 3 (2018/9/28)

Question 1 (Exercise 5.4)

Answer

Analysis:

Question 2 (Exercise 5.9)

Answer

Analysis:

Question 3 (Exercise 5.13)

Answer

Analysis:

Question 4 (Exercise 5.14)

Answer

Analysis:

Homework 4 (2018/10/12)

Question 1 (Exercise 6.9)

Anwser

Analysis:

Question 2 (Exercise 6.10)

Answer

Analysis:

Question 3(Projects 6.B)

Answer

Homework 5 (2018/11/2)

Question 1(Exercise 7.1)

Answer

Analysis

Question 2(Exercise 7.5)

Answer

Analysis

Question 3 (Exercise 7.8)

Answer

Analysis

Qusetion 4 (Exercise 7.11)

Answer

Analysis

Homework 6 (2018/11/16)

Question 1 (Exercise 8.1)

Analysis

Qusetion 2

Homework 7 (2018/11/23)

Question 1

Analysis

Question 2(Exercise 11.4)

Analysis

Homework 8 (2018/11/30)

Question 1 (Exercise 11.6)

Analysis

Question 2

Analysis

Homework 9 (2018/12/7)

Question 1 (Page 204 Exercise 3)

Question 2 (Exercise 4)

Question 3 (Exercise 5)

Question 4 (Page 214 Exercise 3)

Question 5 (Page 214 Exercise 6)

Homework 10 (2018/12/14)

Question 1 (17.5.1 Exercise 4)

Analysis

Question 2 (17.5.1 Exercise 5)

R Package Documentation

fjv587/StatComp18082
Three functions for 'Statistical Computing' course