Homework-2018.09.14

Question

Write an .Rmd file implementing at least three examples of different types from the book "R for Beginners".

Answer

Example 1 (Page_35)

  • A simple matrix calculation in R
A<-matrix(1:9,3,3)   #3x3 matrix filled column-wise with 1:9
B<-diag(2,3,3)       #3x3 diagonal matrix with 2 on the diagonal
C<-A%*%B             #matrix multiplication
C

Example 2 (Page_50)

  • An example showing how to plot the empirical density function
library(lattice)
n <- seq(5, 20, 5)
x <- rnorm(sum(n))
y <- factor(rep(n, n), labels=paste("n =", n))
densityplot(~ x | y,
            panel = function(x, ...) {
              panel.densityplot(x, col="DarkOliveGreen", ...)
              panel.mathdensity(dmath=dnorm,
                                args=list(mean=mean(x), sd=sd(x)),
                                col="darkblue")
            })

Example 3 (Page_22)

  • Using the function ts to create a time series
ts(1:47, frequency = 12, start = c(2018, 9))   #monthly series of length 47 starting in September 2018

Homework-2018.09.21

Question 1

  • A discrete random variable X has probability mass function

| x    |   0 |   1 |   2 |   3 |   4 |
|-----:|----:|----:|----:|----:|----:|
| p(x) | 0.1 | 0.2 | 0.2 | 0.2 | 0.3 |

  • Use the inverse transform method to generate a random sample of size 1000 from the distribution of X. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.

Answer 1

set.seed(123)
#The inverse transform method
n=1000                        #size of sample
u=runif(n)                    #generate u from [0,1] uniform distribution
x=0:4                         #The set of values of variable X 
p=c(.1,.2,.2,.2,.3)           #The probability of X
cp=cumsum(p)                  #The cumulative probability of X
r=x[findInterval(u,cp)+1]     #Find the corresponding value of x
ct <- as.vector(table(r))     #observed counts of each value
ct/sum(ct)/p                  #ratios of empirical to theoretical probabilities; values near 1 indicate accurate results

#Repeat using the R sample function.
r.sample<- sample(x, size = 1000, replace = TRUE,prob = p)
ct.sample <- as.vector(table(r.sample))
ct.sample/sum(ct.sample)/p     
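  • The exercise also asks for a relative frequency table; a small sketch placing both empirical distributions beside the theoretical probabilities (the row labels are my own):
rbind(theoretical = p,
      inverse.transform = ct/sum(ct),
      sample.fn = ct.sample/sum(ct.sample))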

Question 2

  • Write a function to generate a random sample of size n from the Beta(a, b) distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the Beta(3,2) distribution. Graph the histogram of the sample with the theoretical Beta(3,2) density superimposed.

Answer 2

random.Beta<-function(a,b,n){
  if(a<=0||b<=0) stop("a and b must be positive")
  else if(a<=1||b<=1)
    stop("the Beta(a,b) density is unbounded, so this method is not applicable")
  else {
    x0=(a-1)/(a+b-2)   #x0 is the mode of the Beta(a,b) density
    c=dbeta(x0,a,b)    #c is the maximum of the Beta(a,b) density
    j<-k<-0; y <- numeric(n)
    while (k < n) {
      u <- runif(1)
      j <- j + 1       #count the total number of proposals
      x <- runif(1)    #random variate from the Uniform(0,1) proposal g
      if (dbeta(x,a,b) > c*u) {   #accept x with probability f(x)/(c*g(x))
        k <- k + 1
        y[k] <- x
      }
    }
  }
  return(y)
}
x=random.Beta(3,2,1000)
hist(x, prob = T, main ="Histogram of random observations
from Beta(3,2)   vs.  The theoretical density of Beta(3,2)",breaks=seq(0,1,0.05))
curve(dbeta(x,3,2),add=T,col="red")   #superimpose the true Beta(3,2) density

Question 3

  • Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has a $Gamma(\gamma, \beta)$ distribution and that, conditional on $\Lambda$, $Y$ is exponential. That is, $(Y |\Lambda = \lambda) \sim f_{Y}(y|\lambda) = \lambda e^{-\lambda y}$. Generate 1000 random observations from this mixture with $\gamma = 4$ and $\beta = 2$.

Answer 3

  • Integrating $\lambda$ out of the joint density gives the marginal (Lomax, i.e. Pareto type II) density $f_Y(y)=\frac{\gamma\beta^{\gamma}}{(\beta+y)^{\gamma+1}}$, which is superimposed on the histogram below.
pdf.Y<-function(x,r,beta){
      y=r*beta^r/((beta+x)^(r+1))   #marginal density of the Exponential-Gamma mixture
      return(y)
}
n <- 1000 
r <- 4; beta <- 2
lambda <- rgamma(n, r, beta)
x <- rexp(n, lambda)
hist(x, prob = T, main ="Histogram of random observations
from this mixture    vs.  The theoretical density of Y",breaks=seq(0,12,0.5))
y <- seq(0, 12, .1)
lines(y,pdf.Y(y,r,beta),col="red")  ## red line is the theoretical density of Y
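  • As a sanity check (a sketch using the pdf.Y above): the marginal density should integrate to 1, and the Lomax mean $\beta/(\gamma-1)=2/3$ should be close to the sample mean.
integrate(pdf.Y, 0, Inf, r = r, beta = beta)$value  #should be close to 1
c(mean(x), beta/(r - 1))                            #sample mean vs. theoretical mean 2/3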

Homework-2018.09.28

Question 1

  • Write a function to compute a Monte Carlo estimate of the $Beta(3, 3)$ cdf, and use the function to estimate $F(x)$ for $x = 0.1, 0.2,\cdots, 0.9$ . Compare the estimates with the values returned by the pbeta function in R.

Answer 1

set.seed(15656)
MC.Beta33<-function(x){
   m=10000     # The number of random Numbers
   n=length(x) 
   y=NULL
   if(any(x<=0)||any(x>=1)) stop("x must be in (0,1)")
   else {
     for (i in 1:n) {
        g=x[i]*dbeta(runif(m,min=0,max=x[i]),3,3)  #Generate MC random Numbers
        y[i]=mean(g) #the  estimated values
     }
   }
 return(y)
}
x=seq(.1,.9,.1)
true=pbeta(x,3,3) 
y=MC.Beta33(x) 
print(true)   ##print the values returned by the pbeta function in R
print(y)      ##print the values simulated by the MC method
print(y/true) ##print the ratio
  • The ratios are close to 1, which shows the Monte Carlo estimates agree with pbeta.

Question 2

  • The Rayleigh density [156, (18.76)] is $f(x)= {x\over\sigma ^{2}} e^{-x^{2}/(2\sigma^{2})},\ x\ge 0,\ \sigma>0$. Implement a function to generate samples from a Rayleigh($\sigma$) distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$?

Answer 2

library(VGAM)
MC.Rayleigh<-function(x,sigma,antithetic = TRUE){
   m=10000     # The number of random Numbers
   n=length(x) 
   y=NULL
   if(any(x<0)||sigma<=0) stop("x should be non-negative and sigma should be positive")
   else {
     for (i in 1:n) {
        #Using antithetic variables to generate MC random Numbers
        u=runif(m/2,min=0,max=x[i])
        if (!antithetic) v <- runif(m/2,min=0,max=x[i]) else v <- x[i] - u
        U=c(u,v)
        g=x[i]*drayleigh(U, sigma)
        y[i]=mean(g)  #the  estimated values
     }
   }
 return(y)
}
##use the Rayleigh distribution function in R (VGAM) to check the correctness of the estimates
y1=MC.Rayleigh(0.3,1,antithetic = F)  # the estimate without antithetic variables
y2=MC.Rayleigh(0.3,1)                 # the estimate with antithetic variables
true=prayleigh(0.3, 1)
print(c(y1/true,y2/true))             # ratios near 1 show the estimates are correct

## Computing the percent reduction 
m=1000
MC1 <- MC2 <- numeric(m)
for (i in 1:m) {
  MC1[i] <- MC.Rayleigh(1, 2, antithetic = FALSE)
  MC2[i] <- MC.Rayleigh(1, 2)
}
c(sd(MC1),sd(MC2))
100*(var(MC1)-var(MC2))/var(MC1)   # percent reduction in variance achieved by the antithetic estimator
  • The results show that using antithetic variables substantially reduces the variance of the estimates.

Question 3

  • Find two importance functions $f_1$ and $f_2$ that are supported on $(1, \infty)$ and are 'close' to $g(x) = {x^2 \over \sqrt{2\pi}} e^{-x^2/2}, x > 1.$ Which of your two importance functions should produce the smaller variance in estimating $\int _{1} ^{\infty} {x^2\over \sqrt{2\pi}} e^{-x^2/2}\,dx$ by importance sampling? Explain.

Answer 3

x <- seq(1, 3, .1)
w <- 2
g  <- x^2/sqrt(2*pi)*exp(-x^2/2)
f1 <- exp(-x^2/2)/sqrt(2*pi)    #the standard normal density
f2 <- x^2*exp(-x/2)/16          #the chi-squared(6) density
gs <- c(expression(g(x)==x^2/sqrt(2*pi)*e^{-x^2/2}),
        expression(f[1](x)==e^{-x^2/2}/sqrt(2*pi)),
        expression(f[2](x)==e^{-x/2}*{x^2}/16))
par(mfrow=c(1,2))
#figure (a): g and the two candidate importance functions
plot(x, g, type = "l", ylab = "", ylim = c(0,1), lwd = w, col=1, main='(A)')
lines(x, f1, lty = 2, lwd = w, col=2)
lines(x, f2, lty = 3, lwd = w, col=3)
legend("topleft", legend = gs, lty = 1:3, lwd = w, inset = 0.02, col=1:3, cex=0.7)

#figure (b): the ratios g/f1 and g/f2
plot(x, g/f1, type = "l", ylab = "", ylim = c(0,10), lwd = w, lty = 2, col=2, main='(B)')
lines(x, g/f2, lty = 3, lwd = w, col=3)
legend("topleft", legend = gs[-1], lty = 2:3, lwd = w, inset = 0.02, col=2:3, cex=0.7)

Explain

  • The function $f_2$ should produce the smaller variance, because the importance-sampling variance is small when the ratio $g/f$ is nearly constant. Panel (B) shows that $g/f_2$ stays bounded while $g/f_1 = x^2$ grows without bound, so $f_2$ is the better match for $g$; a quick empirical check follows.
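  • A short empirical comparison (a sketch; the helper name g.fun is my own, and it uses the fact that $f_2$ is the chi-squared(6) density):
m <- 1e4
g.fun <- function(x) x^2/sqrt(2*pi)*exp(-x^2/2)*(x > 1)
x1 <- rnorm(m)                        #draws from f1
r1 <- g.fun(x1)/dnorm(x1)
x2 <- rchisq(m, df = 6)               #draws from f2
r2 <- g.fun(x2)/dchisq(x2, df = 6)
c(sd(r1), sd(r2))                     #f2 should give the smaller standard deviation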

Question 4

  • Obtain a Monte Carlo estimate of $\int _{1} ^{\infty} {x^2\over \sqrt{2\pi}} e^{-x^2 \over 2}$ by importance sampling.

Answer 4

g.cut<-function(x,t){   # t is the lower limit of integral
   n=length(x)
   y=numeric(n)
   for(i in 1:n) {
     if(x[i]>t) y[i]=x[i]^2/sqrt(2*pi)*exp(-x[i]^2/2)
     else y[i]=0
   }
   return(y)
}
MC.g<-function(t){ # t is the lower limit of integral
   m=1000000     # The number of random Numbers
   n=length(t) 
   y=numeric(n)
     for (i in 1:n) {
        u=rchisq(m,6)   #draws from f2, the chi-squared(6) importance density
        G <- g.cut(u,t[i])
        f2 <- u^2*exp(-u/2)/16
        y[i]=mean(G/f2) #the  estimated values
     }
  return(y)
} 
MC.g(0) ## the integral over (0, Inf) equals 0.5, so this outcome checks that the program is correct
MC.g(1) ## the estimated value of the target integral
  • The outcome is $\int _{1} ^{\infty} {x^2\over \sqrt{2\pi}} e^{-x^2 \over 2} \approx 0.3982884$

In order to verify the correctness of the program, I also calculated $\int_{0}^{\infty} {x^2\over \sqrt{2\pi}} e^{-x^2 \over 2}$ with the same program; its true value is 0.5. The outcome $\int_{0}^{\infty} {x^2\over \sqrt{2\pi}} e^{-x^2 \over 2} \approx 0.4994781$ is close to 0.5, so we can infer that the program is correct.
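  • For reference, integration by parts (using $\frac{d}{dx}[-\phi(x)]=x\phi(x)$) gives the exact value of the target integral as $\phi(1)+1-\Phi(1)$, which can be checked directly:
dnorm(1) + 1 - pnorm(1)   #exact value of the integral over (1, Inf), about 0.4006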

Homework-2018.10.12

Question 1

  • Let X be a non-negative random variable with $\mu = E[X] < \infty$. For a random sample $x_1, . . . , x_n$ from the distribution of $X$, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum_{j=1}^{n}\sum_{i=1}^{n}|x_i-x_j|$$ The Gini ratio is applied in economics to measure inequality in income distribution (see e.g. [163]). Note that $G$ can be written in terms of the order statistics $x_{(i)}$ as $$G=\frac{1}{n^2\mu}\sum_{i=1}^{n}(2i-n-1)x_{(i)}$$ If the mean is unknown, let $\hat{G}$ be the statistic $G$ with $\mu$ replaced by $\bar{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.

Answer 1

set.seed(15411)
library(MASS)
Gini<-function(x){
  n<-length(x)
  a<-seq(1-n,n-1,2)
  x.i<-sort(x)
  Gini.hat<-sum(a*x.i)/(n*n*mean(x))
  return(Gini.hat)
} # you can estimate a G.hat if there comes a sample
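  • A quick consistency check (a sketch) of the order-statistic formula against the double-sum definition of $G$:
x.test <- runif(10)
c(Gini(x.test),
  sum(abs(outer(x.test, x.test, "-"))) / (2 * length(x.test)^2 * mean(x.test)))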


## X is standard lognormal
n=200
m<-2000
Gini.hat1<-numeric(m)
for(i in 1:m){
  y<-rnorm(n)
  x<-exp(y) # then x is standard lognormal
  Gini.hat1[i]<-Gini(x)
}
result1<-c(mean=mean(Gini.hat1),median=median(Gini.hat1))
print(result1)
print(quantile(Gini.hat1,probs=seq(0.1,0.9,0.1)))   #the deciles
hist(Gini.hat1,breaks=seq(min(Gini.hat1)-0.01,max(Gini.hat1)+0.01,0.01),freq =F,main = "Histogram of Gini",xlab = "Standard lognormal")

## X is uniform
Gini.hat2<-numeric(m)
for(i in 1:m){
  x<-runif(n) # then x is uniform
  Gini.hat2[i]<-Gini(x)
}
result2<-c(mean=mean(Gini.hat2),median=median(Gini.hat2))
print(result2)
print(quantile(Gini.hat2,probs=seq(0.1,0.9,0.1)))   #the deciles
hist(Gini.hat2,breaks =seq(min(Gini.hat2)-0.01,max(Gini.hat2)+0.01,0.01) ,freq =F,main = "Histogram of Gini",xlab = "Uniform")

## x is  Bernoulli(0.1)
Gini.hat3<-numeric(m)
for(i in 1:m){
  x<-rbinom(n,1,0.1) # then x is Bernoulli(0.1)
  Gini.hat3[i]<-Gini(x)
}
result3<-c(mean=mean(Gini.hat3),median=median(Gini.hat3))
print(result3)
print(quantile(Gini.hat3,probs=seq(0.1,0.9,0.1)))   #the deciles
hist(Gini.hat3,breaks=seq(min(Gini.hat3)-0.01,max(Gini.hat3)+0.01,0.01),freq =F,main = "Histogram of Gini",xlab = "Bernoulli(0.1)")

Question 2

  • Construct an approximate 95% confidence intervalfor the Gini ratio $\gamma=E[G]$ if $X$ is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

Answer 2

  • Solve: Given the replicates $G_1,\dots,G_m$, we use $\hat\gamma=\bar{G}$ and $\hat\sigma_G^2=\frac{1}{m-1}\sum_{i=1}^{m}(G_i-\bar{G})^2$ for estimation, and the approximate 95% interval is $(\bar{G}-\hat{\sigma}_G\, t_{m-1}(\alpha/2),\ \bar{G}+\hat{\sigma}_G\, t_{m-1}(\alpha/2))$. I choose $\ln(X)\sim N(0,10^2)$ as the example.
n<-200
m<-2000
mu<-0
sigma<-10
Gini1<-function(n,mu,sigma){
  y<-rnorm(n,mu,sigma)
  x<-exp(y)
  Gini.sample<-Gini(x)
  return(Gini.sample)
}
Gini.sp<-numeric(m)
for(i in 1:m){
Gini.sp[i]<-Gini1(n,mu,sigma)
}
CI<-c(mean(Gini.sp)-sd(Gini.sp)*qt(0.975,m-1),mean(Gini.sp)+sd(Gini.sp)*qt(0.975,m-1))
print(CI)          #the approximate 95% confidence interval 
cover.rate<-sum(I(Gini.sp>CI[1]&Gini.sp<CI[2]))/m
print(cover.rate)  #the proportion of replicates inside the interval; it should be close to 0.95

Question 3

  • Tests for association based on Pearson product moment correlation $\rho$, Spearman's rank correlation coefficient $\rho_{s}$ or Kendall's coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X,Y) such that X and Y are dependent) such that at least one of the nonparametric tests has better empirical power than the correlation test against this alternative.

Answer 3

  • Solve: After many attempts, we find that $$(X,Y)\sim N(0,\Sigma), \quad \Sigma= \left[ \begin{matrix} 1.5 & 0.375 \\ 0.375 & 4 \end{matrix}\right]$$ is a suitable case. Here the covariance of X and Y is 0.375, so the true correlation is $0.375/\sqrt{1.5\times 4}\approx 0.15$.
m<-1000
sigma1<-matrix(c(1.5,0.375,0.375,4),2,2)
p.value1=p.value2=p.value3=numeric(m)
for(i in 1:m){
x1<-mvrnorm(20,mu=c(0,0),Sigma = sigma1)
p.value1[i]<-cor.test(x1[,1],x1[,2],method = "pearson")$p.value
p.value2[i]<-cor.test(x1[,1],x1[,2],method = "spearman")$p.value
p.value3[i]<-cor.test(x1[,1],x1[,2],method = "kendall")$p.value
}
#the power
power<-c(mean(p.value1<=0.05),mean(p.value2<=0.05),mean(p.value3<=0.05))
names(power)<-c("Pearson","Spearman","Kendall")
print(power) ##This result shows that the power of Spearman is bigger than the power of Pearson.

Homework-2018.11.02

Question1

  • Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.
library(boot)
library(bootstrap)
library(DAAG)

Answer1

set.seed(1)
attach(law)
b.cor<-function(x,i){
      y=cor(x[1,i],x[2,i])
      return(y)
}
x=t(law)
n=nrow(law)
theta.hat <- b.cor(x,1:n)
theta.jack <- numeric(n)
for(i in 1:n){
theta.jack[i] <- b.cor(x,(1:n)[-i])
}
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))
round(c(original=theta.hat,bias=bias.jack,se=se.jack),3)

Question2

  • Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $1/\lambda$ by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.

Answer2

  • The intervals differ because each method uses quantiles of a different reference distribution: the normal interval relies on a normal approximation, the basic and percentile intervals use the empirical bootstrap distribution, and BCa additionally corrects for bias and skewness.
attach(aircondit)
X=aircondit$hours

mle.boot<-function(X,i) mean(X[i])
de <- boot(data=X,statistic=mle.boot, R =1000)
ci <- boot.ci(de,type=c("norm","basic","perc","bca"))
ci$norm[2:3]
ci$basic[4:5]
ci$percent[4:5]
ci$bca[4:5]

Question3

  • Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat{\theta}$.

Answer3

attach(scor)
X=scor
n=dim(X)[1]
theta.jac=numeric(n)
eigen.hat=eigen(cov(X))$values
theta.hat=max(eigen.hat)/sum(eigen.hat)   #proportion of variance of the first principal component
for(i in 1:n){
   A=cov(scor[-i,])
   eigen.i=eigen(A)$values
   theta.jac[i]=max(eigen.i)/sum(eigen.i)
}
bias.jac <- (n-1)*(mean(theta.jac)-theta.hat)
se.jac <- sqrt((n-1)*mean((theta.jac-theta.hat)^2))
round(c(original=theta.hat,bias=bias.jac,se=se.jac),3)

Question4

  • In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.

Answer4

  • According to the prediction error criterion, Model 2, the quadratic model, would be the best fit for the data.
library(DAAG)
attach(ironslag)
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- matrix(0, n*(n-1)/2, 2)   #one row of errors per left-out pair
k=1
for(i in 1:(n-1)){
   for(j in (i+1):n ){
         y <- magnetic[-c(i,j)]
         x <- chemical[-c(i,j)]
         J1 <- lm(y ~ x)
         yhat11 <- J1$coef[1] + J1$coef[2] * chemical[i]
         yhat12 <- J1$coef[1] + J1$coef[2] * chemical[j]
         e1[k,1]=yhat11-magnetic[i]
         e1[k,2]=yhat12-magnetic[j]

         J2 <- lm(y ~ x + I(x^2))
         yhat21 <- J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2
         yhat22 <- J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2
         e2[k,1]=yhat21-magnetic[i]
         e2[k,2]=yhat22-magnetic[j]
         J3 <- lm(log(y) ~ x)
         logyhat31 <- J3$coef[1] + J3$coef[2] * chemical[i]
         yhat31 <- exp(logyhat31)
         logyhat32 <- J3$coef[1] + J3$coef[2] * chemical[j]
         yhat32 <- exp(logyhat32)
         e3[k,1] <-  yhat31-magnetic[i]
         e3[k,2] <-  yhat32-magnetic[j] 
         J4 <- lm(log(y) ~ log(x)) 
         logyhat41 <- J4$coef[1] + J4$coef[2] * log(chemical[i])
         logyhat42 <- J4$coef[1] + J4$coef[2] * log(chemical[j])
         yhat41 <- exp(logyhat41)
         yhat42 <- exp(logyhat42)
         e4[k,1] <-  yhat41 -magnetic[i]
         e4[k,2] <-  yhat42 -magnetic[j]
         k=k+1
    }
}

c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))

Homework-2018.11.16

Question1

  • Implement the two-sample Cramer-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.

Answer1

set.seed(1)
library(boxplotdbl)
attach(chickwts)
cvm.ts<-function(x,y){
  n=length(x);m=length(y)
  fn=ecdf(x);gn=ecdf(y)
  w2=m*n/((m+n)^2)*(sum((fn(x)-gn(x))^2)+sum((fn(y)-gn(y))^2))
  return(w2)
}
N=999
x<-sort(as.vector(weight[feed=="soybean"]))
y<-sort(as.vector(weight[feed=="linseed"]))
detach(chickwts)
z=c(x,y)
K=1:26   # indices of the pooled sample (14 soybean + 12 linseed)
reps=numeric(N)
cvm0=cvm.ts(x,y)
for (i in 1:N) {
  k<-sample(K,size = 14,replace = F)
  x1<-z[k]
  y1<-z[-k]
  reps[i]<-cvm.ts(x1,y1)
}
p<-mean(c(cvm0,reps)>=cvm0)
p   #the permutation p-value
hist(reps,main="",freq=F,xlab="W2(p=0.421)",breaks = "scott")
points(cvm0,0,cex=1,pch=16)
  • The p-value of about 0.42 does not support the alternative hypothesis that the distributions differ.

Question2

  • Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations.
    • Unequal variances and equal expectations
    • Unequal variances and unequal expectations
    • Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodal distribution (mixture of two normal distributions)
    • Unbalanced samples (say, 1 case versus 10 controls)
    • Note: The parameters should be chosen such that the powers are distinguishable (say, range from 0.3 to 0.8).

Answer2

library(Ball)
library(energy)
library(boot)
library(RANN)

Design experiments

m<-1e2; k<-3; R<-999   #m replications, k nearest neighbours, R permutations
p.values <- matrix(0,m,3)

Tn<-function(z,ix,sizes,k){
  n1<-sizes[1];n2<-sizes[2];n<-n1+n2
  if(is.vector(z)) z<-data.frame(z,0)
  z<-z[ix, ]
  NN<-nn2(data=z,k=k+1)             #k+1 nearest neighbours (the first is the point itself)
  block1<-NN$nn.idx[1:n1,-1] 
  block2<-NN$nn.idx[(n1+1):n,-1] 
  i1<-sum(block1 <= n1)             #neighbours of sample 1 that fall in sample 1
  i2<-sum(block2 > n1)              #neighbours of sample 2 that fall in sample 2
  (i1+i2)/(k*n)
}

eqdist.nn<-function(z,sizes,k){
  boot.obj<-boot(data=z,statistic=Tn,R=R,sim="permutation",sizes=sizes,k=k)
  ts<-c(boot.obj$t0,boot.obj$t)
  p.value<-mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}
  • Unequal variances and equal expectations
n1<-n2<-50;n<-n1+n2;N=c(n1,n2);mu1=mu2=0;sigma1=1;sigma2=2  
for(i in 1:m){
x<-rnorm(n1,mu1,sigma1)
y<-rnorm(n2,mu2,sigma2)
z<-c(x,y)
p.values[i,1]<-eqdist.nn(z,N,k)$p.value
p.values[i,2]<-eqdist.etest(z,sizes=N,R=R)$p.value
p.values[i,3]<-bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha<-0.1
pow<-colMeans(p.values<alpha)
pow
  • Unequal variances and unequal expectations
n1<-n2<-50;n<-n1+n2;N=c(n1,n2);mu1=1;mu2=2;sigma1=1;sigma2=2   
for(i in 1:m){
x<-rnorm(n1,mu1,sigma1)
y<-rnorm(n2,mu2,sigma2)
z<-c(x,y)
p.values[i,1]<-eqdist.nn(z,N,k)$p.value
p.values[i,2]<-eqdist.etest(z,sizes=N,R=R)$p.value
p.values[i,3]<-bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha<-0.1
pow<-colMeans(p.values<alpha)
pow
  • t distribution with 1 df, bimodal distribution
n1<-n2<-50;n<-n1+n2;N=c(n1,n2);mu1=1;mu2=2;sigma1=1;sigma2=2  
for(i in 1:m){
x<-rt(n1,1)
y<-c(rnorm(n2/2,mu1,sigma1),rnorm(n2/2,mu2,sigma2))
z<-c(x,y)
p.values[i,1]<-eqdist.nn(z,N,k)$p.value
p.values[i,2]<-eqdist.etest(z,sizes=N,R=R)$p.value
p.values[i,3]<-bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha<-0.1
pow<-colMeans(p.values<alpha)
pow
  • Unbalanced samples
n1<-10;n2<-100;n<-n1+n2;N=c(n1,n2) 
for(i in 1:m){
x<-rnorm(n1)
y<-rnorm(n2)
z<-c(x,y)
p.values[i,1]<-eqdist.nn(z,N,k)$p.value
p.values[i,2]<-eqdist.etest(z,sizes=N,R=R)$p.value
p.values[i,3]<-bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
}
alpha<-0.1
pow<-colMeans(p.values<alpha) 
pow

Question3

  • Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy($\theta, \eta$) distribution has density function $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\quad -\infty<x<\infty,\ \theta>0.$$ The standard Cauchy has the Cauchy($\theta=1, \eta=0$) density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)

Answer3

 f_cauchy<-function(x,u,lambda){
    return(lambda/(pi*(lambda^2+(x-u)^2)))
} #the Cauchy density function

cauchy.chain<-function(n,sigma,x0,u,lambda){
    #n: length of the chain
    #sigma: standard deviation of the normal proposal distribution
    #x0: initial value
    #u, lambda: parameters of the Cauchy distribution
    x<-NULL
    x[1]<-x0
    e=runif(n)
    k<-0
    for(i in 2:n){
      y<-rnorm(1,x[i-1],sigma)
      if(e[i]<=(f_cauchy(y,u,lambda)/f_cauchy(x[i-1],u,lambda)))  x[i]<-y
      else{
        x[i]<-x[i-1]
        k<-k+1
      }
    }
    return(list(x=x,k=k))
  } 
  n<-10000
  cauchy<-cauchy.chain(n,sigma=0.5,x0=0,u=0,lambda=1)
  refuse.pr<-cauchy$k/n   #rejection rate of the proposals
  refuse.pr
  #qq plot
  par(mfrow=c(1,1))
  qqplot(rcauchy(n-1000,0,1),cauchy$x[1001:n])
  qqline(cauchy$x[1001:n])
  #histplot
  hist(cauchy$x[1001:n],freq=F,main="density of cauchy",breaks=60)
  curve(f_cauchy(x,0,1),add=TRUE)
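  • The exercise asks to compare deciles; a minimal sketch using the generated chain after burn-in:
  deciles <- seq(0.1, 0.9, 0.1)
  round(rbind(chain = quantile(cauchy$x[1001:n], deciles),
              theoretical = qcauchy(deciles)), 3)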

Question4

  • Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are (125,18,20,34). Assume that the probabilities of the corresponding multinomial distribution are $$(\frac{1}{2}+\frac{\theta}{4}, \frac{1-\theta}{4}, \frac{1-\theta}{4},\frac{\theta}{4})$$. Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.

Answer4

  • With a uniform prior on $\theta$, the posterior of $\theta$ given $(x_1,x_2,x_3,x_4)=(125,18,20,34)$ is proportional to the multinomial likelihood $$Pr(\theta|(x_1,x_2,x_3,x_4))\propto\frac{197!}{x_1!x_2!x_3!x_4!}p_1^{x_1}p_2^{x_2}p_3^{x_3}p_4^{x_4},$$ where $(p_1,p_2,p_3,p_4)=(\frac{1}{2}+\frac{\theta}{4}, \frac{1-\theta}{4}, \frac{1-\theta}{4},\frac{\theta}{4})$
  w <-0.25 #width of the uniform support set 
  n <- 5000 #length of the chain 
  burn <- 1000 #burn-in time 
  y<-c(125,18,20,34)
  x <- numeric(n) #the chain

  prob <- function(b, y) { 
    # computes (without the constant) the target density 
    if (b < 0 || b >= 1) return (0)
    return((1/2+b/4)^y[1] * ((1-b)/4)^y[2] * ((1-b)/4)^y[3] * (b/4)^y[4])
  }

  u <- runif(n) #for accept/reject step
  v <- runif(n, -w, w) #proposal distribution 
  x[1] <-0.25 
  for (i in 2:n) { 
    z <- x[i-1] + v[i] 
    if (u[i] <= prob(z,y) / prob(x[i-1],y)) 
      x[i] <-z 
    else 
      x[i] <- x[i-1] 
  }

   xb <- x[(burn+1):n]
   xc<-mean(xb)
   print(xc)  #estimation value of theta
   print(y/sum(y))
   print(c(1/2+xc/4,(1-xc)/4,(1-xc)/4,xc/4))

Homework-2018.11.23

Question1

  • For exercise 9.6, use the Gelman-Rubin method to monitor convergence of the chain, and run the chain until the chain has converged approximately to the target distribution according to $\hat R < 1.2$.

Answer1

set.seed(1)
Gelman.Rubin <- function(psi) {
  psi<-as.matrix(psi)        # psi[i,j] is the statistic psi(X[i,1:j])
  n<-ncol(psi)               # for chain in i-th row of X
  k<-nrow(psi)
  psi.means<-rowMeans(psi)   #row means
  B<-n*var(psi.means)        #between variance est.
  psi.w<-apply(psi,1,"var")  #within variances
  W<-mean(psi.w)             #within est.
  v.hat<-W*(n-1)/n+(B/n)     #upper variance est.
  r.hat<-v.hat/W             #G-R statistic
  return(r.hat)
}       

prob<-function(y,ob){ # computes the target density 
  if(y < 0 || y >= 1){
    return(0)
    }
  else{
    return((0.5+y/4)^ob[1]*((1-y)/4)^ob[2]*((1-y)/4)^ob[3]*(y/4)^ob[4])
    }
}

mult.chain<-function(ob,w,m,x1){
   x<-numeric(m)      #the chain
   u<-runif(m)        #for accept/reject step 
   v<-runif(m,-w,w)   #proposal distribution 
   x[1]<-x1 
   for(i in 2:m) { 
   y<-x[i-1]+v[i] 
      if(u[i]<=prob(y,ob)/prob(x[i-1],ob)){
            x[i]<-y
      }
   else{ 
            x[i]<-x[i-1]} 
   }
   return(x)
}

w<-.5               #width of the uniform support set 
ob<-c(125,18,20,34) #the observed sample
m<-10000            #length of the chain 
burn<-1000          #burn-in time 
k<-4                #number of chains to generate
x0<-c(0.5,0.9,0.1,0.75)  #choose overdispersed initial values

#generate the chains
X<-matrix(0,nrow=k,ncol=m)
for(i in 1:k){
  X[i, ]<-mult.chain(ob,w,m,x0[i])
}

psi<-t(apply(X,1,cumsum)) #compute diagnostic statistics
for(i in 1:nrow(psi)){
  psi[i,]<-psi[i,]/(1:ncol(psi))
}
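  • To run the chain until $\hat R<1.2$, we can compute the Gelman-Rubin statistic for increasing chain lengths and look for the point where it first falls below 1.2 (a minimal sketch using the psi matrix above):
rhat <- rep(NA, m)
for (j in (burn+1):m) rhat[j] <- Gelman.Rubin(psi[, 1:j])
plot((burn+1):m, rhat[(burn+1):m], type = "l", xlab = "iteration", ylab = "R hat")
abline(h = 1.2, lty = 2)
min(which(rhat < 1.2))   #first iteration at which R.hat drops below 1.2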

# the R.hat plot shows the chain has approximately converged to the target distribution within approximately 3000 iterations 

Question2

Find the intersection points $A(k)$ in $(0,\sqrt{k})$ of the curves $$S_{k-1}(a)=P( t(k-1)>\sqrt{\frac{a^2(k-1)}{k-a^2}} )$$ and $$S_{k}(a)=P( t(k)>\sqrt{\frac{a^2k}{k+1-a^2}} ),$$ for $k=4:25,100,500,1000,$ where $t(k)$ is a Student $t$ random variable with $k$ degrees of freedom. (These intersection points determine the critical values for a t-test for scale-mixture errors proposed by Székely [260].)

Answer2

Ak<-function(a,k){     
  t1<- sqrt(a^2*(k-1)/(k-a^2))
  t2<- sqrt(a^2*k/(k+1-a^2)) 
  return(pt(t1,df=k-1)-pt(t2,df=k))
}
##A root of Ak (i.e. Ak = 0) is an intersection point of the two curves.

K=c(4:25,100,500,1000)
ipts=numeric(length(K))
for(i in 1:length(K)){
    ipts[i]=uniroot(Ak,lower=1e-4,upper=sqrt(K[i])-(1e-4),k=K[i])$root
}
ipts #output the result (wrong for large k; see below)

## Checking ipts, we find that for K >= 23 the root finder returns the right
## endpoint of the search interval, which must be wrong.
## So we should change the right endpoint based on the curves.
k=23  ## for example
x=seq(0.01,sqrt(k)-1e-4,length=1000)
y=Ak(x,k)
plot(x,y,type="l")
## The curve shows that the root is smaller than 3.
## The other large values of k give the same picture.
## So change the right endpoint to 3.
wrong=which(K>22)
for (i in wrong){
  ipts[i] <- uniroot(Ak,lower =1e-4,upper=3,k=K[i])$root
}
names(ipts)=K
ipts   ## the final result

Homework-2018.11.30

Question1

  • Write a function to compute the cdf of the Cauchy distribution, which has density $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\quad -\infty<x<\infty,\ \theta>0.$$ Compare your results to the results from the R function pcauchy. (Also see the source code in pcauchy.c.)

Answer1

f<-function(y,eta,theta){
#the Cauchy pdf; theta is the scale parameter, eta is the location parameter
1/(theta*pi*(1+((y-eta)/theta)^2))
}
p_cauchy<-function(x,eta,theta,lower.tail=TRUE){
 if(lower.tail) res<-integrate(f,lower = -Inf,upper = x,rel.tol=.Machine$double.eps^0.25,theta=theta,eta=eta)
 else res<-integrate(f,lower = x,upper = Inf,rel.tol=.Machine$double.eps^0.25,theta=theta,eta=eta)
  return(res$value)
}
##compare
p_cauchy(x=0,eta = 0,theta = 1)
pcauchy(0,location = 0,scale = 1)

p_cauchy(x=1,eta =3,theta = 2,lower.tail = F )
pcauchy(1,location = 3,scale = 2,lower.tail = F)
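  • A broader check (a sketch) over a grid of points:
xs <- seq(-3, 3, 1)
cbind(mine = sapply(xs, p_cauchy, eta = 0, theta = 1),
      base = pcauchy(xs))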
  • The results match pcauchy, which shows the implementation is correct.

Question2

  • A-B-O blood type problem
  • Let the three alleles be A, B, and O. The genotype frequencies and counts are:

| Genotype  | AA       | BB       | OO       | AO       | BO       | AB       | Sum |
|-----------|----------|----------|----------|----------|----------|----------|-----|
| Frequency | $p^2$    | $q^2$    | $r^2$    | $2pr$    | $2qr$    | $2pq$    | 1   |
| Count     | $n_{AA}$ | $n_{BB}$ | $n_{OO}$ | $n_{AO}$ | $n_{BO}$ | $n_{AB}$ | n   |
  • Observed data: $n_{A\cdot}=n_{AA}+n_{AO}=28$ (A-type), $n_{B\cdot}=n_{BB}+n_{BO}=24$ (B-type), $n_{OO}=41$ (O-type), $n_{AB}=70$ (AB-type).

  • Use EM algorithm to solve MLE of $p$ and $q$ (consider missing data $n_{AA}$ and $n_{BB}$).

  • Record the maximum likelihood values in M-steps, are they increasing?

Answer2

  • The complete-data log-likelihood is $$l(p,q\mid n_{AA},n_{BB},n_{OO},n_{A\cdot},n_{B\cdot},n_{AB})=2n_{AA}\log p+2n_{BB}\log q+2n_{OO}\log r+(n_{A\cdot}-n_{AA})\log(2pr)+(n_{B\cdot}-n_{BB})\log(2qr)+n_{AB}\log(2pq),$$ where $r=1-p-q$. In the E-step the missing counts $n_{AA}$ and $n_{BB}$ are replaced by their conditional expectations given the current estimates, and in the M-step we minimize $-E[l(p,q\mid\cdot)]$ numerically.
N<-10000 #max. number of iterations 
tol<-.Machine$double.eps 
L.old<-c(.2,.35)  #initial est. for p and q 
M.value<-0
L.list<-data.frame(p=0,q=0)

mlef<-function(l,l.old,n.A=28,n.B=24,n.OO=41,n.AB=70 ){
r<-1-sum(l)
r.old<-1-sum(l.old)
n.AA<-n.A*l.old[1]^2/(l.old[1]^2+2*l.old[1]*r.old)
n.BB<-n.B*l.old[2]^2/(l.old[2]^2+2*l.old[2]*r.old)
llh<-2*n.OO*log(r)+n.AB*log(2*l[1]*l[2])+
2*n.AA*log(l[1])+2*n.BB*log(l[2])+
(n.A-n.AA)*log(2*l[1]*r)+(n.B-n.BB)*log(2*l[2]*r)
-llh
}

for(j in 1:N){
res<-optim(c(0.3,0.2),mlef,l.old=L.old)
L<-res$par
L.list[j,]<-L
M.value[j]<- -res$value
if(sum(abs(L-L.old)/L.old)<tol) break 
L.old<-L
}
L.list  #p,q
print(M.value)  ##the log-likelihood values recorded at the M-steps
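  • The question asks whether these values are increasing; a quick check (a sketch):
all(diff(M.value) >= 0)   #EM should make these values non-decreasing, so TRUE is expected
plot(M.value, type = "b", xlab = "EM iteration", ylab = "log-likelihood")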

Homework-2018.12.07

Question1

  • Use both for loops and lapply() to fit linear models to the mtcars data using the formulas stored in this list: formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )

Answer1

set.seed(123)
formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )
#loops
output <- vector("list", length(formulas))
for (i in seq_along(formulas)){
  output[[i]]<-lm(formulas[[i]], data = mtcars)
}
output
#lapply
lapply(formulas, lm, data = mtcars)

Question2

  • Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function? bootstraps <- lapply(1:10, function(i) { rows <- sample(1:nrow(mtcars), rep = TRUE) mtcars[rows, ] })

Answer2

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ] })

#loops
for(i in seq_along(bootstraps)){
  print(lm(mpg~disp,data =bootstraps[[i]]))
}

#lapply
lapply(bootstraps,lm,formula=mpg~disp)

Question3

For each model in the previous two exercises, extract $R^2$ using the function below: rsq <- function(mod) summary(mod)$r.squared

Answer3

rsq <- function(mod) summary.lm(mod)$r.squared
#loops
for (i in seq_along(formulas)) {
  print(rsq(lm(formulas[[i]], data = mtcars)))
}
#lapply
lapply(lapply(formulas, lm, data = mtcars), rsq)
#loops
for(i in seq_along(bootstraps)){
  print(rsq(lm(mpg~disp,data =bootstraps[[i]])))
}

#lapply
lapply(lapply(bootstraps,lm,formula=mpg~disp),rsq)

Question4

  • The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial. trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE ) Extra challenge: get rid of the anonymous function by using [[ directly.

Answer4

#using a named helper function (the `[[` version follows below)
trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE )
p_value<-function(mod) mod$p.value
sapply(trials, p_value)
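  • Extra challenge: the helper function can be avoided entirely by passing `[[` to sapply directly:
sapply(trials, "[[", "p.value")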

Question5

  • Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?

Answer5

  • This solution requires every output of the function to have the same length and mode.
Mvap<-function (f, n, type, ...) {
  #n: length of each output of f; type: its mode, e.g. "numeric", "character", "complex", "logical"
  f <- match.fun(f)
  tt <- Map(f, ...)                        #iterate in parallel over all inputs
  vapply(tt, identity, vector(type, n))    #check length/mode and assemble a vector or matrix
}
#example
kk<-function(x,w){
  y=weighted.mean(x, w, na.rm = TRUE)
  return(c(1,y))
}
xs <- replicate(5, runif(10), simplify = FALSE)
ws <- replicate(5, rpois(10, 5) + 1, simplify = FALSE)
Mvap(kk,2,"numeric",xs, ws)
is.matrix(Mvap(kk,2,"numeric",xs, ws))

Homework-2018.12.14

Question1

  • Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or coding from the mathematical definition (http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test).

Answer1

library(microbenchmark)
new.chisq.test<-function(x,y){
if(!is.vector(x) || !is.vector(y))
stop("at least one of 'x' and 'y' is not a vector")
if(!is.numeric(x) || !is.numeric(y))
stop("at least one of 'x' and 'y' is not a numeric vector")
if(any(x<0) || anyNA(x)) 
stop("all entries of 'x' must be nonnegative and finite")
if(any(y<0) || anyNA(y)) 
stop("all entries of 'y' must be nonnegative and finite")
if(sum(x)==0) 
stop("at least one entry of 'x' must be positive")
if(sum(y)==0) 
stop("at least one entry of 'y' must be positive")
if(length(x)!=length(y)) 
stop("'x' and 'y' must have the same length")
DNAME<-paste(deparse(substitute(x)),"and",deparse(substitute(y)))
METHOD<-"Pearson's Chi-squared test"
x<-rbind(x,y)
nr<-as.integer(nrow(x));nc<-as.integer(ncol(x))
sr<-rowSums(x);sc<-colSums(x);n<-sum(x)
E<-outer(sr,sc,"*")/n
STATISTIC<-sum((x - E)^2/E)
names(STATISTIC)<-"X-squared"
structure(list(statistic=STATISTIC,method=METHOD,data.name=DNAME),class="htest")
}

#example
a<-c(568,327,494);b<-c(484,339,444)    
new.chisq.test(a,b)        
chisq.test(rbind(a,b))
microbenchmark(t1=new.chisq.test(a,b),t2=chisq.test(rbind(a,b)))  

Question2

  • Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

Answer2

new.table<-function(...,dnn = list.names(...),deparse.level = 1){
    list.names <- function(...) {
        l <- as.list(substitute(list(...)))[-1L]
        nm <- names(l)
        fixup <- if (is.null(nm)) 
            seq_along(l)
        else nm == ""
        dep <- vapply(l[fixup], function(x) switch(deparse.level + 
            1, "", if (is.symbol(x)) as.character(x) else "", 
            deparse(x, nlines = 1)[1L]), "")
        if (is.null(nm)) 
            dep
        else {
            nm[fixup] <- dep
            nm
        }
    }
    args <- list(...)
    if (!length(args)) 
        stop("nothing to tabulate")
    if (length(args) == 1L && is.list(args[[1L]])) {
        args <- args[[1L]]
        if (length(dnn) != length(args)) 
            dnn <- if (!is.null(argn <- names(args))) 
                argn
            else paste(dnn[1L], seq_along(args), sep = ".")
    }
    bin <- 0L
    lens <- NULL
    dims <- integer()
    pd <- 1L
    dn <- NULL
    for (a in args) {
        if (is.null(lens)) 
            lens <- length(a)
        else if (length(a) != lens) 
            stop("all arguments must have the same length")
        fact.a <- is.factor(a)
        if (!fact.a) {
            a0 <- a
            a <- factor(a)
        }
        ll <- levels(a)
        a <- as.integer(a)
        nl <- length(ll)
        dims <- c(dims, nl)
        dn <- c(dn, list(ll))
        bin <- bin + pd * (a - 1L)
        pd <- pd * nl
    }
    names(dn) <- dnn
    bin <- bin[!is.na(bin)]
    if (length(bin)) 
        bin <- bin + 1L
    y <- array(tabulate(bin, pd), dims, dimnames = dn)
    class(y) <- "table"
    y
}

#example
a<-b<-c(1,1,2,3)            
new.table(a,b)
table(a,b)
microbenchmark(t1=new.table(a,b),t2=table(a,b))    

