Question

Write a .Rmd file to implement at least three examples of different types in the above books (texts, numerical results, tables, and figures).

Answer

Example 1

# Example 1: build a small patient data frame and cross-tabulate
# diabetes type against treatment status.
patientID <- c(1, 2, 3, 4)                          # patient identifiers
age       <- c(25, 34, 28, 52)                      # age in years
diabetes  <- c("Type1", "Type2", "Type1", "Type1")  # diabetes type
status    <- c("Poor", "Improved", "Excellent", "Poor")

# Assemble the columns into one data frame and display it.
patientdata <- data.frame(patientID, age, diabetes, status)
patientdata

# Contingency table: rows = diabetes type, columns = status.
table(patientdata$diabetes, patientdata$status)

Example 2

# Example 2: simple linear regression of fuel efficiency (mpg) on
# vehicle weight (wt) for eight cars.
mpg <- c(39.4,36.4,33.8,31.5,29.6,27.8,26.3,24.9)
wt <- c(1897,1665,1559,1550,1490,1500,1480,1380)
lm(mpg~wt) # print the fitted coefficients
lmfit <- lm(mpg~wt) # keep the fitted model object
summary(lmfit) # coefficient table, R-squared, residual summary
plot(lmfit) # standard lm diagnostic plots

Question 1

A discrete random variable $X$ has probability mass function

|$x$|0|1|2|3|4|
|-|-|-|-|-|-|
|$p(x)$|0.1|0.2|0.2|0.2|0.3|

Use the inverse transform method to generate a random sample of size 1000 from the distribution of $X$. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.

Answer

# Inverse transform sampling from the discrete pmf p on support x.
x <- c(0,1,2,3,4)        # support of X
p <- c(.1,.2,.2,.2,.3)   # probabilities p(x)
cp <- cumsum(p)          # cdf values F(x)
m <- 1000                # sample size
# Inverse transform: F(x[i-1]) < u <= F(x[i])  =>  deliver x[i].
# findInterval(u, cp) counts how many cdf values are <= u; adding 1
# gives the index of the generated value.
r <- x[findInterval(runif(m),cp)+1]
# Tabulate over the full support via factor() so that categories with
# zero observed counts still appear (plain table(r) would drop them and
# misalign ct with x and p below).
ct <- as.vector(table(factor(r, levels = x)))
ct
# Empirical relative frequencies of the generated sample.
tp <- ct/sum(ct)
ratio <- tp/p            # empirical / theoretical
data.frame(x,p,tp,ratio)
# Relative frequency table comparing the empirical with the theoretical
# probabilities.

Next we repeat using the R sample function.

# Repeat the experiment with R's built-in sample() function.
set.seed(1234)
# Restate the distribution so this chunk stands alone (same values as above).
x <- c(0,1,2,3,4); p <- c(.1,.2,.2,.2,.3); m <- 1000
r2 <- sample(x, m, replace = TRUE, prob = p)
# factor() keeps zero-count categories so ct2 always has length 5.
ct2 <- as.vector(table(factor(r2, levels = x)))
tp2 <- ct2/sum(ct2)      # empirical relative frequencies
ratio <- tp2/p           # compare with theoretical p
data.frame(x,p,tp2,ratio)
# Comparison produced with the R sample function.

Question 2

Write a function to generate a random sample of size n from the $Beta(a, b)$ distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the $Beta(3,2)$ distribution. Graph the histogram of the sample with the theoretical $Beta(3,2)$ density superimposed.

Answer

$Beta(3,2)$ with pdf $f(x)=12x^2(1-x),~0<x<1$. Take $c=12$ and let $g$ be the $U(0,1)$ density, $g(x)=1,~0<x<1$; a candidate $x\sim g$ is accepted when $u \le f(x)/(c\,g(x)) = x^2(1-x)$.

# Acceptance-rejection sampling from Beta(3,2) with density
# f(x) = 12 x^2 (1-x) on (0,1).  The envelope is the U(0,1) density
# g(x) = 1 with c = 12, so a candidate x is accepted when
# u < f(x)/(c g(x)) = x^2 (1-x); the acceptance rate is about 1/12.
# (A tighter constant c = max f = 16/9 would be more efficient.)
n <- 1000
j<-k<-0
y <- numeric(n)
while (k < n) 
  {
    u <- runif(1)
    j <- j + 1
    x <- runif(1) #random variate from g, the U(0,1) density
    if (x^2 * (1-x) > u) 
      {
        #we accept x
        k <- k + 1
        y[k] <- x
      }
   }
j
# total number of candidate draws needed to accept n values
hist(y, prob = TRUE, main = expression(f(x)==12*x^2*(1-x)))
x <- seq(0, 1, .001)
lines(x, 12*x^2*(1-x),col="red") # theoretical Beta(3,2) density overlay
# Histogram of the sample with the theoretical Beta(3,2) density
# superimposed; the generated sample fits the Beta(3,2) model well.

Question 3

Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has a $Gamma(r, b)$ distribution and $Y$ has an $Exp(\Lambda)$ distribution. That is, $(Y\mid\Lambda=\lambda)\sim f_Y(y\mid\lambda)=\lambda e^{-\lambda y}$. Generate 1000 random observations from this mixture with $r = 4$ and $b = 2$.

Answer

library(evir)  # provides rgpd(), used in the next chunk for comparison
n <- 1000; r <- 4; b <- 2
lambda <- rgamma(n, r, b) #lambda is random: Lambda ~ Gamma(r, b)
#generate the mixture: Y | Lambda = lambda ~ Exp(lambda)
x <- rexp(n, lambda)
#compare the mixture with the GPD; marginally Y has the Pareto density
#  r * b^r / (b + y)^(r+1) = 64 / (2 + y)^5, which equals the
#  expression plotted below
hist(x,prob=TRUE,col="grey",main="Density of GPD(0.25,0,0.5)")
y <- seq(0, 100, .001)
lines(y,1/2^4*(1/2+1/4*y)^(-5),col="red") #the density function of generalized Pareto distribution:1/2^4*(1/2+1/4*x)^(-5)

# The figure shows the simulated mixture matches the theoretical
# density, especially in the right tail.

Next we compare the simulation with random draws from the theoretical generalized Pareto distribution.
#the theoretical model: generalized Pareto distribution
y1 <- rgpd(n,xi=1/r,mu=0,beta=b/r)  # evir::rgpd; n, r, b come from the previous chunk
par(mfrow=c(1,2))
hist(x,main="Histogram of Simulation",col="pink")        # the Exp-Gamma mixture sample
hist(y1,main="Histogram of GPD",col="light green")       # the theoretical GPD sample
# The side-by-side histograms show the simulation agrees with the GPD.

Question 1

Write a function to compute a Monte Carlo estimate of the $Beta(3, 3)$ cdf, and use the function to estimate $F(x)$ for $x = 0.1, 0.2, \ldots, 0.9$. Compare the estimates with the values returned by the pbeta function in R.

Answer

$$B(a,b)=\int_0^1x^{a-1}(1-x)^{b-1}dx=E[g(X)],\qquad X\sim U(0,1),$$

$$\text{where}\quad g(x)=(1-0)\,x^{a-1}(1-x)^{b-1}.$$

# Monte Carlo estimate of the incomplete beta integral
#   B(t; a, b) = integral_0^t x^(a-1) (1-x)^(b-1) dx
# by averaging g(x) = x^(a-1) (1-x)^(b-1) over x ~ U(0, t).
#
# n    : number of Monte Carlo replicates
# t    : upper limit of integration (t = 1 gives the complete B(a, b))
# a, b : Beta shape parameters
# Returns the Monte Carlo estimate (single numeric).  The original
# ended with an assignment, returning the value only invisibly; the
# last expression now returns it explicitly.
MCofBeta <- function(n, t, a, b) {
  x <- runif(n, 0, t)                        # x ~ U(0, t)
  mean(x^(a - 1) * (1 - x)^(b - 1)) * t      # E[g(X)] * (t - 0)
}
print(c(MCofBeta(1e4, 1, 3, 3), beta(3, 3)))  # MC estimate vs. exact beta(3, 3)
## The result shows that the MC estimate works well.

$$F(t;a,b)=\frac{B(t;a,b)}{B(a,b)},\qquad B(t;a,b)=\int_0^t x^{a-1}(1-x)^{b-1}dx,\qquad B(a,b)=\int_0^1 x^{a-1}(1-x)^{b-1}dx.$$

# Estimate F(x) = B(x; 3, 3) / B(3, 3) at x = 0.1, ..., 0.9 and compare
# with pbeta().  A named result vector is preallocated instead of
# assigning into the built-in constant F (masking TRUE/FALSE is a
# classic R pitfall).
x <- y <- ratio <- c(.1, .2, .3, .4, .5, .6, .7, .8, .9)
Fhat <- numeric(length(x))                     # MC estimates of F(x)
for (i in seq_along(x)) {
  # F(x) estimated as B(x; 3, 3) / B(1; 3, 3), both by Monte Carlo
  Fhat[i] <- MCofBeta(1e4, x[i], 3, 3) / MCofBeta(1e4, 1, 3, 3)
  y[i] <- pbeta(x[i], 3, 3)                    # theoretical value
  ratio[i] <- Fhat[i] / y[i]                   # should be close to 1
}
data.frame(F = Fhat, y, ratio)                 # keep the original column name
## The table shows that the MC estimate works well; the ratio is almost
## equal to 1.

Question 2

The Rayleigh density ([156, (18.76)]) is $$f(x) = \frac{x}{\sigma^2}e^{-x^2/(2\sigma^2)},\qquad x\geq 0,\ \sigma > 0.$$ Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1, X_2$?

Answer

Implement the function MC.R to generate Monte Carlo cdf estimates for the $Rayleigh(\sigma)$ distribution from $$\theta=E_U\!\left[\frac{x^2U}{\sigma^2}e^{-(Ux)^2/(2\sigma^2)}\right],\qquad U\sim U(0,1).$$

# Monte Carlo estimate of the Rayleigh(sigma) cdf at the points in x,
# optionally using antithetic variables:
#   F(x) = E_U[ x^2 U / sigma^2 * exp(-(U x)^2 / (2 sigma^2)) ],  U ~ U(0,1).
#
# x          : vector of evaluation points
# sigma      : Rayleigh scale parameter
# n          : total number of uniforms per evaluation point
# antithetic : TRUE -> use the pairs (u, 1-u); FALSE -> independent halves
# Returns a vector of cdf estimates, one per element of x.
#
# Fix: the original began with `n <- numeric(n)`, clobbering n with a
# zero vector, so runif(n/2) silently drew length(n) = n uniforms and
# the pooled sample had size 2n instead of n.
MC.R <- function(x, sigma, n, antithetic) {
  u <- runif(n / 2)             # first half of the uniform sample
  if (!antithetic)
    v <- runif(n / 2)           # independent second half
  else
    v <- 1 - u                  # antithetic second half
  u <- c(u, v)                  # pooled sample of size n

  cdf <- numeric(length(x))
  for (i in seq_along(x)) {
    # integrand evaluated at every u for this x
    g <- x[i]^2 * u / (sigma^2) * exp(-(u * x[i])^2 / (2 * sigma^2))
    cdf[i] <- mean(g)           # MC estimate of F(x[i])
  }
  cdf
}

The cdf of Rayleigh distribution:(F(x;\sigma)=1-e^{-x^2/{2\sigma^2}},x\geq0)

# Check MC.R against the closed-form Rayleigh cdf
#   F(x; sigma) = 1 - exp(-x^2 / (2 sigma^2)).
set.seed(1234)
sigma <- 1;n <- 1e4
x <- R <- seq(.1, 2.5,length=5) #value the x: five evaluation points
for(i in 1:length(x))
  R[i] <- 1-exp(-x[i]^2/(2*sigma^2))   #The theoretical cdf of Rayleigh

  MC1 <- MC.R(x,sigma,n,antithetic = FALSE)  #(x1+x2)/2 with independent x1,x2
  MC2 <- MC.R(x,sigma,n,antithetic = TRUE)   #(x+x`)/2 with negatively correlated x,x`

print(round(rbind(x, MC1, MC2, R), 5))
## Both MC1 and MC2 agree with the theoretical cdf R; MC2 (antithetic)
## is closer.

Next compare the variance and calculate the percent of reduction from MC1 to MC2

# Compare the variances of the two estimators at x = 6 and compute the
# percent reduction achieved by the antithetic version.
set.seed(1234)
m <- 1e2;sigma <- 1;x <- 6;n <- 1e4
MC1 <- MC2 <- numeric(m)
for (i in 1:m) {
MC1[i] <- MC.R(x,sigma,n,antithetic = FALSE)  # independent halves
MC2[i] <- MC.R(x,sigma,n,antithetic = TRUE)   # antithetic halves
}
var1 <- var(MC1)
var2 <- var(MC2)
ratio <- (var1-var2)/var1   #the percent of variance reduction
print(c(var1,var2,ratio))

Question 3

Find two importance functions $f_1$ and $f_2$ that are supported on $(1, \infty)$ and are 'close' to $$g(x) = \frac{x^2}{\sqrt{2\pi}}e^{-x^2/2},\qquad x > 1.$$ Which of your two importance functions should produce the smaller variance in estimating $$\int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling? Explain.

Answer

$$g(x) = \frac{x^2}{\sqrt{2\pi}}e^{-x^2/2},\ x > 1;\qquad f_1(x)=\frac1{x^2},\ x>1;\qquad f_2(x)=\frac{x}{\sqrt{2\pi}}e^{(1-x^2)/4},\ x>1.$$

 x <- seq(1,7,.01)
w<-2
g <- exp(-x^2/2)*x^2/sqrt(2*pi) 
f1 <- 1/(x^2)
f2 <- x*exp((1-x^2)/4)/sqrt(2*pi)
gs <- c(expression(g(x)==e^{-x^2/2}*x^2/sqrt(2*pi)),expression(f[1](x)==1/(x^2)),expression(f[2](x)==x*e^{(1-x^2)/4}/sqrt(2*pi)))
    #for color change lty to col
    par(mfrow=c(1,2))
    #figure (a)
    plot(x, g, type = "l", ylab = "", ylim = c(0,2),lwd = w,col=1,main='(A)')
    lines(x, f1, lty = 2, lwd = w,col=2)
    lines(x, f2, lty = 3, lwd = w,col=3)
    legend("topright", legend = gs,
           lty = 1:3, lwd = w, inset = 0.02,col=1:3)

    #figure (b)
   plot(x, g/f1,  type = "l",ylim = c(0,3.2),ylab = "",lty = 2, lwd = w,col=2)
    lines(x, g/f2, lty = 3, lwd = w,col=3)
    legend("topright", legend = gs[-1],
           lty = 2:3, lwd = w, inset = 0.02,col=2:3)
##The figures above show that f2 is more similiar to g according to figure(A), and it is also more aclinic than f1 in figure(B).f2 is the better estimate than f1.
# Importance-sampling comparison of f1 and f2.
g <- function(x) {x^2*exp(-x^2/2)/sqrt(2*pi)*(x>1)}  # integrand, zero for x <= 1
m <- 10000   #size
theta.hat <- se <- numeric(2)
#using inverse transform method to generate random sample from f1:
#F1(x) = 1 - 1/x on (1, Inf), so x = 1/(1-u)
u <- runif(m) 
x <- 1/(1-u)
fg <- g(x)*x^2            # importance ratio g(x)/f1(x), f1(x) = 1/x^2
theta.hat[1] <- mean(fg)
se[1] <- sd(fg)           #sigma for f1

#using inverse transform method to generate random sample from f2
# NOTE(review): f2 as defined integrates to 2/sqrt(2*pi) < 1 over
# (1, Inf), i.e. it is not a normalized density, and this inverse
# formula should be re-derived -- verify before relying on theta.hat[2].
u <- runif(m)
x <- (4*(1-log((1-u)*sqrt(2*pi)/2)))^0.5
fg <- g(x)/(x*exp((1-x^2)/4)/sqrt(2*pi))   # importance ratio g(x)/f2(x)
theta.hat[2] <- mean(fg)
se[2] <- sd(fg)
rbind(theta.hat,se)
## As inferred from the figures, f2 gives the estimate with the smaller
## standard error.

Question 4

Obtain a Monte Carlo estimate of [\int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx] by importance sampling.

Answer

We choose $f_2$ from Question 3 as the importance function close to $g$: $$f(x)=\frac{x}{\sqrt{2\pi}}e^{(1-x^2)/4},\qquad \int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx=E\!\left[\frac{g(X)}{f(X)}\right],\quad X\sim f.$$

# Final importance-sampling estimate using f2 with a large sample.
g <- function(x) {x^2*exp(-x^2/2)/sqrt(2*pi)*(x>1)}  # integrand, zero for x <= 1
m <- 1e7                  #size
u <- runif(m)             #generate the random u ~U(0,1)
x <- (4*(1-log((1-u)*sqrt(2*pi)/2)))^0.5       #using inverse transform method to generate random sample from f
fg <- g(x)/(x*exp((1-x^2)/4)/sqrt(2*pi))       # importance ratios g(x)/f(x)
theta.hat <- mean(fg)     #the estimate of E(g(x)/f(x)),also the estimate of integral
theta.hat
# NOTE(review): the exact value is dnorm(1) + 1 - pnorm(1), roughly
# 0.4006 -- useful as a check on the estimate.

Question 1

Let X be a non-negative random variable with $\mu = E[X] < \infty$. For a random sample $x_1, \ldots, x_n$ from the distribution of X, the Gini ratio is defined by $$G=\frac1{2n^2\mu}\sum_{j=1}^n\sum_{i=1}^n|x_i-x_j|.$$ The Gini ratio is applied in economics to measure inequality in income distribution (see e.g. [163]). Note that G can be written in terms of the order statistics $x_{(i)}$ as $$G=\frac1{n^2\mu}\sum_{i=1}^n(2i-n-1)x_{(i)}.$$ If the mean is unknown, let $\hat{G}$ be the statistic G with $\mu$ replaced by $\bar{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if X is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.

Answer

# Simulate the sampling distribution of the Gini ratio.
#
# cdf : random-number generator for the assumed distribution (e.g. rlnorm)
# mu  : theoretical expectation of that distribution
# n   : size of each random sample
# m   : number of Monte Carlo replicates
# a,b : x-axis limits for the histogram (tune after inspecting a first run)
#
# Prints the mean, median and deciles of the replicated Gini ratios and
# draws a density histogram of the replicates.  Improvements: the inner
# order-statistic loop is vectorized (same terms, same sum), the
# builtins mean/median are no longer shadowed by local variables, and
# freq = FALSE is spelled out instead of the reassignable F.
GiniEstimation <- function(cdf, mu, n, m, a, b)
{
  Gcdf <- numeric(m)
  set.seed(1234)                      # kept for reproducibility of the report
  wts <- 2 * seq_len(n) - n - 1       # order-statistic weights (2i - n - 1)

  for (i in seq_len(m)) {
    x <- sort(cdf(n))                 # order statistics x_(1) <= ... <= x_(n)
    # G = (1 / (n^2 mu)) * sum_i (2i - n - 1) x_(i)
    Gcdf[i] <- sum(wts * x) / (n^2 * mu)
  }

  print(mean(Gcdf))
  print(quantile(Gcdf, 0.5))
  print(quantile(Gcdf, probs = seq(0, 1, 0.1)))
  # Density histogram of the replicates.
  hist(Gcdf, freq = FALSE, xlim = c(a, b),
       main = "Gini ratio of supposed distribution")
}

The standard lognormal with theoritical expectation of $X$: $EX=e^{1/2}$

# Standard lognormal: E[X] = exp(1/2).
GiniEstimation(rlnorm,exp(0.5),1e3,1e2,0.45,0.65)
## The mean is close to the median, as the deciles and histogram show.

The uniform distribution with$X\sim \mathbb{U}(0,1)$$E(X)=0.5$

# Uniform U(0,1): E[X] = 0.5.
GiniEstimation(runif,0.5,1e3,1e2,0.32,0.35)
## The mean is close to the median, as the deciles and histogram show.

The Bernoulli(0.1) distribution with $X\sim Bernoulli(1000,0.1)$$E(X)=100$

# Gini-ratio replicates for the third distribution (inlined rather than
# via GiniEstimation because mu and the histogram limits differ).
# NOTE(review): rbinom(n, 1000, 0.1) draws Binomial(1000, 0.1) variates
# (mean 100, matching mu below), not Bernoulli(0.1) -- confirm which
# distribution the exercise intends.
n <- m <- 1e3
mu <- 100
Gcdf<-numeric(m)
set.seed(1234)
v <- numeric(n)

for(i in 1:m)  
  {
  x<-sort(rbinom(n,1000,0.1))     #sort the x (order statistics)

  for(j in 1:n)   
    {
    v[j] <- (2*j-n-1)*x[j]
  } 
  s <- sum(v)          
  Gcdf[i]<-(1/(n^2*mu))*s   #Gini ratio G for replicate i
}
mean<-mean(Gcdf)
median<-quantile(Gcdf,0.5)
deciles<-quantile(Gcdf,probs=seq(0,1,0.1))
print(mean)
print(median)
print(deciles)
hist(Gcdf,freq=F,xlim=c(0.048,0.058),main="Gini ratio of Bernoulli(0.1)")  #density histogram of the replicates
## The mean is close to the median, as the data and histogram show.

Question 2

Construct an approximate 95% confidence interval for the Gini ratio $\gamma = E[G]$ if X is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.

Answer

First we estimate $\sigma = sd(\log X)$, where $\log X \sim N(0,1)$; then we use $\sigma$ to calculate $\gamma$: $$\gamma=E[G]=\operatorname{erf}(\sigma/2)=\frac2{\sqrt{\pi}}\int_0^{\sigma/2}e^{-t^2}dt,$$ from which we construct the confidence interval.

# Monte Carlo assessment of the coverage of an approximate 95% CI for
# the Gini ratio gamma = E[G] when X is lognormal.
n<-1000              #sample size per replicate
m<-100               #100 CIs per batch, repeated for 100 batches
alpha <- 0.05
UCL <- LCL <- numeric(m)
G <- function(b)      #the true value of gamma; b stands for sigma
  {

  return(2*pnorm(b/sqrt(2))-1)   # erf(b/2) written via the normal cdf
}
CI.sigma<-function(x,n){                     #chi-square confidence interval
  s2<-(1/(n-1))*sum((x-mean(x))^2)
  LCL<- (n-1)*s2/qchisq(1-alpha/2,n-1)
  UCL<-(n-1)*s2/qchisq(alpha/2,n-1)
  c(LCL,UCL)
  # NOTE(review): these are bounds for the VARIANCE sigma^2, yet G()
  # below is applied to them directly as if they bounded sigma.  With
  # sigma = 1 the numerical difference is small, but consider sqrt().
}
ciG<-cbind(numeric(m),numeric(m))   # CI endpoints transformed through G
C<-CR<-numeric(m)                   # C: coverage indicator; CR: per-batch rate
e <- G(1)                           # true gamma (sigma = 1 for standard lognormal)
set.seed(1234)
for(j in 1:m){
  for(i in 1:m){

  x<-rlnorm(n)
  ci<-CI.sigma(log(x),n)            # CI built from log(x) ~ N(0,1)
  ciG[i,1]<-G(ci[1])
  ciG[i,2]<-G(ci[2])
  if(e>ciG[i,1] && e<ciG[i,2])     # for every ith ci, judge whether the true value e is inside
    C[i]<-1
  else
    C[i]<-0
   }

CR[j]<-mean(C)                      # empirical coverage of batch j
}
mean(CR)                            # overall empirical coverage rate
## The coverage rate is slightly above 0.95.
##The coverage rate of estimation is a little bigger than 0.95.

Question 3

Tests for association based on the Pearson product moment correlation $\rho$, Spearman's rank correlation coefficient $\rho_s$, or Kendall's coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X, Y) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.

Answer

Assume $(X,Y)\sim N(\mu,\Sigma)$, with $\mu=(0,0)^T$ and $\Sigma$ is of following: \begin{matrix} 1 & \rho\ \rho & 1 \end{matrix}

Spearman's rank correlation:

# Empirical power of Spearman's rank test over a grid of correlations
# rho for bivariate normal samples.
library(MASS)
n <- 20
m <- 1000
rho.s <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1))   #alternatives
M <- length(rho.s)
power.s <- numeric(M)
set.seed(1234)
for (i in 1:M) {
rho <-rho.s[i]
pvalues.s <- replicate(m, expr = {       #simulate under the alternative rho
x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
cortest.s <- cor.test(x[,1],x[,2],
alternative = "two.sided",method="spearman")
cortest.s$p.value } )
power.s[i] <- mean(pvalues.s<= .05)      # rejection rate at level .05
}
power.s

Kendall's coefficient:

# Empirical power of Kendall's tau test over the same grid of rho.
n <- 20
m <- 1000
rho.k <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1))    #alternatives
M <- length(rho.k)
power.k <- numeric(M)
set.seed(1234)
for (i in 1:M) {
rho <-rho.k[i]
pvalues.k <- replicate(m, expr = {       #simulate under the alternative rho
x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
cortest.k <- cor.test(x[,1],x[,2],
alternative = "two.sided",method="kendall")
cortest.k$p.value } )
power.k[i] <- mean(pvalues.k<= .05)      # rejection rate at level .05
}
power.k

Pearson's correlation:

# Empirical power of the Pearson correlation test over the same grid.
n <- 20
m <- 1000
rho.p <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1))       #alternatives
M <- length(rho.p)
power.p <- numeric(M)
set.seed(1234)
for (i in 1:M) {
rho <-rho.p[i]
pvalues.p <- replicate(m, expr = {     #simulate under the alternative rho

x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
cortest.p <- cor.test(x[,1],x[,2],
alternative = "two.sided",method="pearson")
cortest.p$p.value } )
power.p[i] <- mean(pvalues.p<= .05)    # rejection rate at level .05
}
power.p

Make comparision

# Stack the three power curves for side-by-side comparison.
power.rho<-rbind(power.s,power.k,power.p)
colnames(power.rho)<-c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1))  # label columns by rho
power.rho

The results show that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test in the bivariate normal case.

The example $X\sim t_{2}$, $Y\sim U(-1,0)$ when $X<0$ and $Y\sim U(0,1)$ when $X\ge0$.$X,Y$ are dependent.

# Alternative example: X ~ t(2); Y ~ U(-1,0) when X < 0 and U(0,1)
# when X >= 0, so X and Y are dependent.  Spearman test power:
n <- 20
m <- 1000
set.seed(1234)
pvalues.s1 <- replicate(m, expr = {   #simulate under the dependent alternative
x <- rt(n,2)
y<-numeric(n)
for(i in 1:n){
  if(x[i]<0) 
    y[i]<-runif(1,-1,0)
  else       
    y[i]<-runif(1,0,1)
}
cortest.s1 <- cor.test(x,y,
alternative = "two.sided",method="spearman")
cortest.s1$p.value } )
power.s1 <- mean(pvalues.s1<= .05)
power.s1
## high empirical power for the Spearman test
# Same alternative, power of the Pearson correlation test:
n <- 20
m <- 1000
set.seed(431)
pvalues.p1 <- replicate(m, expr = {
#simulate under the dependent alternative
x <- rt(n,2)
y<-numeric(n)
for(i in 1:n){
  if(x[i]<0) y[i]<-runif(1,-1,0)
  else       y[i]<-runif(1,0,1)
}
cortest.p1 <- cor.test(x,y,
alternative = "two.sided",method="pearson")
cortest.p1$p.value } )
power.p1 <- mean(pvalues.p1<= .05)
power.p1

Power of the nonparametric test Spearman's rank correlation coefficient is much better according to the results.

Question 1

Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.

Answer

This question is about the correlation between LSAT and GPA in the law data. An unbiased estimate of the bias $E(\hat\theta)-\theta_0$ is $$(n-1)\left(\bar{\hat\theta}_{(\cdot)}-\hat\theta\right),$$ and an estimate of $se(\hat\theta)$ is $$\sqrt{\frac{n-1}{n}\sum_{i=1}^n\left(\hat\theta_{(i)}-\bar{\hat\theta}_{(\cdot)}\right)^2}.$$

# Jackknife estimates of the bias and standard error of the correlation
# between LSAT and GPA in the law data (Example 7.2).
#declare the variables
library(bootstrap)   # provides the law data
n <-nrow(law)   #sample size
theta.jack <- numeric(n)   # leave-one-out estimates
theta.hat <- cor(law[,1], law[,2]) #law[,1] is LSAT, law[,2] is GPA
b.cor <- function(x,i)cor(x[i,1],x[i,2])   # correlation computed on rows i
for(i in 1:n)
  {
    theta.jack[i] <- b.cor(law,(1:n)[-i])  #leave one out
}
bias.jack <- (n-1)*(mean(theta.jack)-theta.hat)     #jackknife estimate of bias
se.jack <- sqrt((n-1)*mean((theta.jack-theta.hat)^2))   #jackknife estimate of standard error
round(c(original=theta.hat,bias=bias.jack,se=se.jack),3)
## Jackknife estimates of the bias and standard error of the correlation
## between LSAT and GPA in the law data.

Question 2

Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $\frac{1}{\lambda}$ by the standard normal, basic, percentile,and BCa methods. Compare the intervals and explain why they may differ.

Answer

The MLE of $\lambda$ is $\hat\lambda=\frac{n}{\sum_{i=1}^{n}x_i}=\frac{1}{\bar{X}}$.

# 95% bootstrap CIs (normal, basic, percentile, BCa) for the mean time
# between failures, using the aircondit data.
#declare the variables
library(boot)   # boot(), boot.ci(), and the aircondit data
n <- 100 #sample size used in the coverage experiment below
m <- 100 # number of replicate CIs in the coverage experiment below
ci.norm<-ci.basic<-ci.perc<-ci.bca<-matrix(NA,m,2)   # CI storage for the next chunk
boot.mean <- function(x,i) mean(x$hours[i])   # statistic: mean of the hours column
de <- boot(aircondit,statistic=boot.mean,R = 999)
ci <- boot.ci(de,type=c("norm","basic","perc","bca")) #the four confidence intervals
ci
## The four CIs differ noticeably; the sample is small, so the bootstrap
## distribution of the mean is far from normal.

Compare the four kinds of CI

# Coverage comparison: generate exponential samples with rate equal to
# the MLE from the aircondit data and record how often each CI type
# covers the true mean mu.
set.seed(4)
mu <- mean(aircondit$hours)#the estimate of mean time between failures
lambda <- 1/mu             # MLE of the exponential rate
tboot.mean <- function(x,i) mean(x[i])
for (i in 1:m) {
  U<-runif(n);R<--log(1-U)/lambda #exponential sample via inverse transform
  de <- boot(data=R,statistic=tboot.mean, R = 999)
  ci <- boot.ci(de,type=c("norm","basic","perc","bca"))
  ci.norm[i,]<-ci$norm[2:3];ci.basic[i,]<-ci$basic[4:5]
  ci.perc[i,]<-ci$percent[4:5];ci.bca[i,]<-ci$bca[4:5]
}
cat('norm =',mean(ci.norm[,1]<=mu & ci.norm[,2]>=mu),
    'basic =',mean(ci.basic[,1]<=mu & ci.basic[,2]>=mu),
    'perc =',mean(ci.perc[,1]<=mu & ci.perc[,2]>=mu),
    'BCa =',mean(ci.bca[,1]<=mu & ci.bca[,2]>=mu))

## The percentile method shows the best empirical coverage for this
## exponential setting.

Question 3

Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat\theta$.

Answer

We have the empirical $\theta$$$\hat\theta=\frac{\hat\lambda_1}{\sum_{j=0}^{m}\hat\lambda_j}$$m is the column number of data,$\hat\lambda_j$is the empirical sorted eigenvalue by decreasing order of covariance matrix.

# Jackknife estimates of bias and standard error of
#   theta.hat = lambda_1 / sum_j lambda_j,
# the proportion of variance explained by the first principal component
# of the scor data, where lambda_j are the eigenvalues of the covariance
# matrix sorted in DECREASING order (lambda_1 is the largest).
#
# Fixes: (1) sort(eigenvalues) sorts ASCENDING, so the original
# lambda[1] picked the SMALLEST eigenvalue, contradicting the stated
# formula -- use decreasing = TRUE; (2) theta.jack is now preallocated
# here instead of silently reusing the length-15 vector left over from
# Question 1; (3) the statistic helper returns its value explicitly.
library(bootstrap)
n <- nrow(scor)                     # number of observations
m <- ncol(scor)                     # number of score variables
theta.jack <- numeric(n)            # leave-one-out estimates

# Statistic on the full sample.
covmatrix <- cov(scor)              # covariance matrix
lambda <- sort(eigen(covmatrix)$values, decreasing = TRUE)
theta.hat <- lambda[1] / sum(lambda)   # empirical theta

# theta computed on the rows indexed by i (used for leave-one-out).
b.theta <- function(x, i) {
  ev <- sort(eigen(cov(x[i, ]))$values, decreasing = TRUE)
  ev[1] / sum(ev)
}

for (i in 1:n) {
  theta.jack[i] <- b.theta(scor, (1:n)[-i])   # leave observation i out
}
bias.jack <- (n - 1) * (mean(theta.jack) - theta.hat)         # jackknife bias
se.jack <- sqrt((n - 1) * mean((theta.jack - theta.hat)^2))   # jackknife se
round(c(original = theta.hat, bias = bias.jack, se = se.jack), 3)
## Jackknife estimates of the bias and standard error of theta.hat.

Question 4

In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.

Answer

# Leave-two-out cross validation comparing four models for the
# ironslag data (magnetic ~ chemical).
#declare the variables
library(DAAG)
attach(ironslag)
n <- length(magnetic)                  # number of observations
e1 <- e2 <- e3 <- e4 <- numeric(n)     # per-observation prediction errors

# Each pass k holds out the PAIR of observations (2k-1, 2k).
# Fix: remove both indices in one step.  The original deleted them
# sequentially -- y[-2*k] after an earlier removal hits a shifted
# position -- and the unparenthesized chemical[-2*k-1] removed index
# 2k+1 (operator precedence), not 2k-1.
# With odd n the last observation is never held out; its error stays 0.
for (k in 1:floor(n/2))
  {
out <- c(2*k - 1, 2*k)                 # the two held-out indices
y <- magnetic[-out]
x <- chemical[-out]

J1 <- lm(y ~ x) #Linear model
for (j in out) {
  e1[j] <- magnetic[j] - (J1$coef[1] + J1$coef[2] * chemical[j])
}

J2 <- lm(y ~ x + I(x^2)) #Quadratic model
for (j in out) {
  e2[j] <- magnetic[j] -
    (J2$coef[1] + J2$coef[2] * chemical[j] + J2$coef[3] * chemical[j]^2)
}

J3 <- lm(log(y) ~ x) #Exponential model: predict on the log scale, back-transform
for (j in out) {
  e3[j] <- magnetic[j] - exp(J3$coef[1] + J3$coef[2] * chemical[j])
}

J4 <- lm(log(y) ~ log(x)) #Log-Log model
for (j in out) {
  e4[j] <- magnetic[j] - exp(J4$coef[1] + J4$coef[2] * log(chemical[j]))
}
}
c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
## Model 2 (quadratic) gives the smallest prediction error, so it is the
## best fit according to the cross-validation criterion.
J2
## The fitted quadratic model is shown above.

Question 1

Implement the two-sample Cramer-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.

Answer

The Cramér–von Mises statistic is $$W_2=\frac{mn}{(m+n)^2}\left[\sum_{i=1}^{n}\left(F_n(x_i)-G_m(x_i)\right)^2+\sum_{j=1}^{m}\left(F_n(y_j)-G_m(y_j)\right)^2\right].$$

# Two-sample Cramer-von Mises permutation test for equal distributions,
# applied to the soybean and linseed groups of the chickwts data.
# Fixes: direct column access replaces attach()/detach(), and cvm() is
# self-contained (the original read m, n, N and a scratch vector S from
# the global environment).
x <- sort(chickwts$weight[chickwts$feed == "soybean"])
y <- sort(chickwts$weight[chickwts$feed == "linseed"])

z <- c(x, y)                  # pooled sample
m <- length(x); n <- length(y); N <- length(z)
R <- 999                      # number of permutation replicates
W <- numeric(R)               # permutation replicates of the statistic

# Cramer-von Mises statistic for two samples x1, y1:
#   W2 = m n / (m + n)^2 * sum over the pooled sample of (Fn - Gm)^2
cvm <- function(x1, y1) {
  m1 <- length(x1); n1 <- length(y1)
  z1 <- c(x1, y1)
  Fn <- ecdf(x1); Gm <- ecdf(y1)
  m1 * n1 / (m1 + n1)^2 * sum((Fn(z1) - Gm(z1))^2)
}

W0 <- cvm(x, y)               # observed statistic
for (i in 1:R) {
  # indices of the permuted first sample; the complement forms the second
  k <- sample(1:N, size = m, replace = FALSE)
  W[i] <- cvm(z[k], z[-k])
}
p <- mean(c(W0, W) >= W0)     # permutation p-value
p
## The p-value does not support the alternative that the distributions
## differ.

Question 2

Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations.

(1) Unequal variances and equal expectations

(2) Unequal variances and unequal expectations

(3) Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)

(4) Unbalanced samples (say, 1 case versus 10 controls)

Note: The parameters should be choosen such that the powers are distinguishable (say, range from 0.3 to 0.9).

Answer

#declare the variables
library(RANN)    # nn2() nearest-neighbour search
library(energy)  # eqdist.etest()
library(Ball)    # bd.test()
library(boot)    # boot() drives the permutation scheme
m <- 103; k<-3; p<-2; mu <- 0.5; set.seed(12345)
n1 <- n2 <- 50; R<-999; n <- n1+n2; N = c(n1,n2)
# Nearest-neighbour statistic: proportion of each point's k nearest
# neighbours that fall in the same sample as the point itself.
Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]
n2 <- sizes[2]
n <- n1 + n2
z <- z[ix, ]                 # rows permuted by the boot index vector
o <- rep(0, NROW(z))         # extra constant column passed to nn2
z <- as.data.frame(cbind(z, o))
NN <- nn2(z, k=k+1)          # k+1 because the nearest neighbour is the point itself
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5)  # first-sample neighbours inside the first sample
i2 <- sum(block2 > n1 + .5)  # second-sample neighbours inside the second sample
return((i1 + i2) / (k * n))
}
# Permutation p-value for the NN statistic via boot(sim = "permutation").
eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R,
  sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}
p.values <- matrix(NA,m,3)   # columns: NN, energy, ball

#Experiment 1.
# NOTE(review): the heading said "unequal variances and equal
# expectations", but the code shifts the MEAN of y's second coordinate
# (sd stays 1) -- confirm which scenario was intended.
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value  #NN
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value  #energy method
  p.values[i,3] <-
    bd.test(x=x,y=y,R=R,seed=i*12345)$p.value  #ball method
}
alpha <- 0.05                        #nominal test level 
pow <- colMeans(p.values<alpha)      # empirical power of each method
pow
#Experiment 2: larger mean shift in y's second coordinate.
# NOTE(review): as in experiment 1, only the mean differs here (sd
# stays 1) despite the "unequal variances" heading -- verify intent.
mu <- 0.6
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value  #NN
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value  #energy method
  p.values[i,3] <-
    bd.test(x=x,y=y,R=R,seed=i*12345)$p.value  #ball method
}
alpha <- 0.05                        #nominal test level 
pow <- colMeans(p.values<alpha)      # empirical power of each method
pow
#Experiment 3: non-normal distributions -- t with 1 df (heavy-tailed) in
# the first coordinate, a two-component normal mixture in the second.
# NOTE(review): the first coordinate of x uses rt(n1,1) while the second
# uses rnorm(n2,...); this only lines up because n1 == n2 == 50 here.
for(i in 1:m){
  x <- cbind(rt(n1,1),rnorm(n2, mean = sample(c(0, 1), size = n2, prob = c(0.5, 0.5), replace = T), sd =1 ))
  y <- cbind(rt(n2,1),rnorm(n2, mean = sample(c(1, 2), size = n2, prob = c(0.5, 0.5), replace = T), sd =1 ))
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value  #NN
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value  #energy method
  p.values[i,3] <-
    bd.test(x=x,y=y,R=R,seed=i*12345)$p.value  #ball method
}
alpha <- 0.05                        #nominal test level 
pow <- colMeans(p.values<alpha)
pow
#Experiment 4: unbalanced samples (10 cases versus 100 controls).
n1 <- 10;n2 <- 1e2;N = c(n1,n2)
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value  #NN
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value  #energy method
  p.values[i,3] <-
    bd.test(x=x,y=y,R=R,seed=i*12345)$p.value  #ball method
}
alpha <- 0.05                        #nominal test level 
pow <- colMeans(p.values<alpha)
pow

Question 3

Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a $Cauchy(\theta, \eta)$ distribution has density function $$ f(x) =\frac{1} {\theta\pi(1 + [(x- \eta)/\theta]^2)}, - \infty < x < \infty, \theta>0. $$ The standard Cauchy has the $Cauchy(\theta= 1, \eta = 0)$ density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)

Answer

# Metropolis-Hastings sampler for the standard Cauchy distribution with
# a symmetric U(-1, 1) random-walk proposal.
#declare the variables
library(stats)
theta <- 1;eta <- 0            # standard Cauchy parameters
m <- 10000                     # chain length
x <- numeric(m)

# Cauchy(theta, eta) density.
f <- function(x, theta,eta) {
if (theta > 0) 
return(1/(theta*pi*(1+((x-eta)/theta)^2)))
}

x[1] <- runif(1,min=-1,max=1)  # initial state
k <- 0                         # rejection counter
u <- runif(m)                  # pre-drawn acceptance uniforms
for (i in 2:m) {
xt <- x[i-1]
y <- xt+runif(1,min=-1,max=1)  # symmetric random-walk proposal
# With a symmetric proposal the q(.|.) terms cancel, so the acceptance
# ratio is simply f(y)/f(xt).  (The original multiplied numerator and
# denominator by matching dnorm() factors -- equal by symmetry, so they
# always cancelled; worse, they suggested a normal proposal that was
# never actually used to generate y.)
if (u[i] <= f(y, theta,eta) / f(xt, theta,eta)) {
x[i] <- y
} else {
x[i] <- xt
k <- k+1 # y is rejected
}
}
print(k)  # number of rejected proposals

Compare the deciles of the generated observations with the deciles of the standard Cauchy distribution

# Compare the deciles of the generated observations (after burn-in)
# with the deciles of the standard Cauchy distribution.
b <- 1001                 # burn-in cutoff: discard x[1:1000]
y <- x[b:m]               # retained portion of the chain
a <- ppoints(100)
QC <- qcauchy(a)          # theoretical standard Cauchy quantiles
# Fix: use the post-burn-in sample y here -- the original passed the
# full chain x to quantile(), defeating the burn-in it had just applied.
Q <- quantile(y, a)
qqplot(QC, Q, main="",xlim=c(-2,2),ylim=c(-2,2),xlab="Cauchy Quantiles", ylab="Sample Quantiles")
hist(y, breaks="scott", main="", xlab="", freq=FALSE)
lines(QC, f(QC,theta,eta))   # overlay the Cauchy density from the sampler chunk

Question 4

Rao $[220, Sec. 5g]$ presented an example on genetic linkage of 197 animals in four categories (also discussed in $[67, 106, 171, 266])$. The group sizes are $(125, 18, 20, 34)$. Assume that the probabilities of the corresponding multinomial distribution are $$ (\frac12+\frac{\theta}{4},\frac{1-\theta}4,\frac{1-\theta}4,\frac{\theta}4). $$ Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.

Answer

# Random-walk Metropolis sampler for the posterior of theta in the
# genetic linkage model with multinomial cell probabilities
# (1/2 + theta/4, (1-theta)/4, (1-theta)/4, theta/4).
#declare the variables
gsize <- c(125,18,20,34)  #observed group sizes
m <- 5000                 # chain length
w <- .25                  # half-width of the uniform proposal
burn <- 1000              # burn-in length
# Fix: allocate the chain before writing into it -- the original never
# declared x here, so x[1] <- .25 only worked because a stale x from an
# earlier chunk happened to exist in the workspace.
x <- numeric(m)
#the target density: unnormalized multinomial likelihood of theta,
#zero outside the admissible range [0, 1]
prob <- function(y, gsize) {
  if (y < 0 || y >1)
    return (0)
  else
  return((1/2+y/4)^gsize[1] *((1-y)/4)^gsize[2]*((1-y)/4)^gsize[3]*(y/4)^gsize[4])
}

u <- runif(m)         #for the accept/reject step
v <- runif(m, -w, w)  #proposal increments
x[1] <- .25           # initial value
for (i in 2:m) {
  y <- x[i-1] + v[i]
  if (u[i] <= prob(y, gsize) / prob(x[i-1], gsize))
    x[i] <- y 
  else
    x[i] <- x[i-1]    # reject: stay at the current state
}
xtheta <- x[(burn+1):m]   # discard the burn-in sample
theta.hat <- mean(xtheta) # posterior mean estimate of theta
theta.hat
## The posterior-mean estimate of theta is shown above.

Question 4

Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are 278 Statistical Computing with R (125, 18, 20, 34). Assume that the probabilities of the corresponding multinomial distribution are$$(\frac{1}{2}+\frac{\theta}{4},\frac{1-\theta}{4},\frac{1-\theta}{4},\frac{\theta}{4}).$$ Estimate the posterior distribution of theta given the observed sample, using one of the methods in this chapter.

Answer

# Duplicate of the previous answer (same sampler); this version does
# allocate the chain explicitly before use.
#declare the variables
gsize <- c(125,18,20,34)  #observed group sizes
m <- 5000                 # chain length
w <- .25                  # half-width of the uniform proposal
burn <- 1000              # burn-in length
x <- numeric(m)           # allocate the chain
#the target density: unnormalized multinomial likelihood of theta
prob <- function(y, gsize) {
  if (y < 0 || y >1)
    return (0)
  else
  return((1/2+y/4)^gsize[1] *((1-y)/4)^gsize[2]*((1-y)/4)^gsize[3]*(y/4)^gsize[4])
}

u <- runif(m)  #for the accept/reject step
v <- runif(m, -w, w)  #proposal increments
x[1] <- .25
for (i in 2:m) {
  y <- x[i-1] + v[i]
  if (u[i] <= prob(y, gsize) / prob(x[i-1], gsize))
    x[i] <- y 
  else
    x[i] <- x[i-1]
}
x   # NOTE(review): prints the entire 5000-element chain; likely unintended
xtheta <- x[(burn+1):m]   # discard the burn-in sample
theta.hat <- mean(xtheta) # posterior mean estimate of theta
theta.hat
## The posterior-mean estimate of theta is shown above.

Question 1

Write a function to compute the cdf of the Cauchy distribution, which has density $$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\qquad -\infty < x < \infty,\ \theta > 0.$$ Compare your results to the results from the R function pcauchy. (Also see the source code in pcauchy.c.)

Answer

# declare the variables
q <- c(-222, -22, -2, 0, 4, 44, 444, Inf)  # quantiles at which to evaluate the cdf
m <- length(q)
CDF <- numeric(m)      # computed cdf values, one per quantile
eta <- 0; theta <- 1   # location and scale parameters of the Cauchy density
# the Cauchy density function
f <- function(x, eta, theta) {
  1 / (1 + ((x - eta) / theta)^2) / theta / pi
}

# cdf at q[i] = integral of the density from -Inf to q[i]
for (i in seq_len(m)) {
  CDF[i] <- integrate(f, lower = -Inf, upper = q[i],
                      rel.tol = .Machine$double.eps^0.25,
                      eta = eta, theta = theta)$value
}
pcauchy <- pcauchy(q, location = eta, scale = theta)
data.frame(q, pcauchy, CDF)
## As the result above shows, the computed CDF matches pcauchy at every q.

Question 2

A-B-O blood type problem. Let the three alleles be A, B, and O, with frequencies $p$, $q$, and $r = 1 - p - q$.

| Genotype  | AA | BB | OO | AO | BO | AB | Sum |
|-----------|----|----|----|----|----|----|-----|
| Frequency | $p^2$ | $q^2$ | $r^2$ | $2pr$ | $2qr$ | $2pq$ | 1 |
| Count | $n_{AA}$ | $n_{BB}$ | $n_{OO}$ | $n_{AO}$ | $n_{BO}$ | $n_{AB}$ | $n$ |

Observed data: $n_{A\cdot} = n_{AA} + n_{AO} = 28$ (A-type), $n_{B\cdot} = n_{BB} + n_{BO} = 24$ (B-type), $n_{OO} = 41$ (O-type), $n_{AB} = 70$ (AB-type).

Use the EM algorithm to solve for the MLE of $p$ and $q$, treating $n_{AA}$ and $n_{BB}$ as missing data.

Record the maximum likelihood values in the M-steps; are they increasing?

Answer

#declare the variables
nA <- 28; nB <- 24; nAB <- 70; nOO <- 41  #observed phenotype counts
L <- c(0.3, 0.2)                          #initial values for p, q
tol <- .Machine$double.eps^0.5            #convergence tolerance
n <- 10000                                #max. number of iterations
EI <- M <- numeric(n)                     #log-likelihood recorded at each M-step
p <- L[1]; q <- L[2]                      #initial values
for (i in 1:n) {
  # E-step: conditional expectations given current (p, q);
  # a = p/(p + 2r) is the expected AA fraction among A-types (r = 1 - p - q),
  # and likewise b for BB among B-types
  a <- p/(2 - p - 2*q)
  b <- q/(2 - 2*p - q)
  fp  <- nA*(1 + a) + nAB               #expected number of A alleles
  fq  <- nB*(1 + b) + nAB               #expected number of B alleles
  fpq <- nA*(1 - a) + nB*(1 - b) + 2*nOO #expected number of O alleles

  M[i] <- fp*log(p) + fpq*log(1 - p - q) + fq*log(q) #record the expected
                                                     #log-likelihood (M-step value)

  x <- p; y <- q                        #store i-th values of p and q
  p <- fp/(fp + fq + fpq); q <- fq/(fp + fq + fpq) #M-step: update the parameters
  p.hat <- p; q.hat <- q                #BUG FIX: record BEFORE the break test so
                                        #the final (converged) update is reported
  if (abs(p - x)/x < tol && abs(q - y)/y < tol) break #stop once converged
}
EI <- M[1:i]  #the recorded M-step values
data.frame(p.hat, q.hat)
##the estimates of p and q are shown above.
i ##the number of iterations until convergence
-EI
x <- seq(1, i, 1)
plot(x, EI)
##the log-likelihood values in the M-steps are increasing (EM monotonicity)

Question 1

Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:

# the four candidate regression formulas to fit against mtcars
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)

Answer

For loops

 # fit every formula in `formulas` to mtcars with an explicit loop,
 # preallocating the output list
 out_formulas <- vector("list", length(formulas))
 for (i in seq_along(formulas)) {
   out_formulas[[i]] <- lm(formulas[[i]], data = mtcars)
 }
 out_formulas

lapply()

lapply(formulas, lm, data = mtcars)

Question 2

Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply().Can you do it without an anonymous function?

# ten bootstrap resamples of the rows of mtcars
bootstraps <- lapply(1:10, function(i) {
  # BUG FIX: spell out `replace = TRUE`; the original relied on partial
  # argument matching of `rep`, which is fragile. Also use seq_len().
  rows <- sample(seq_len(nrow(mtcars)), replace = TRUE)
  mtcars[rows, ]
})

Answer

for loops

# fit mpg ~ disp to every bootstrap sample with an explicit loop,
# preallocating the output list
out_bootstrap <- vector("list", length(bootstraps))
for (i in seq_along(bootstraps)) {
  out_bootstrap[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]])
}
out_bootstrap

lapply()

 lapply(bootstraps, lm, formula = mpg~disp)

Question 3

For each model in the previous two exercises, extract R2 using the function below.

rsq <- function(mod) summary(mod)$r.squared

Answer

# extract R^2 from each fitted model in the two lists built above
lapply(out_formulas, rsq)
lapply(out_bootstrap, rsq)

Question 4

The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.

# run 100 independent two-sample t-tests on Poisson data, keeping the
# full htest objects in a list (no simplification)
trials <- lapply(
  seq_len(100),
  function(trial) t.test(rpois(10, 10), rpois(7, 10))
)

Extra challenge: get rid of the anonymous function by using [[ directly.

Answer

sapply(trials, function(mod) mod$p.value)

Extra challenge:

sapply(trials, `[[`, 'p.value')

Question 5

Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?

Answer

 # parallel Map + vapply: iterate over all inputs in parallel and return a
 # typed vector (or matrix) whose element shape is given by FUN.VALUE
 library(parallel)
 mcvMap <- function(f, FUN.VALUE, ...) {
   mapped <- mcMap(f, ...)          # parallel elementwise application of f
   vapply(mapped, identity, FUN.VALUE)  # collapse list to a typed result
 }

Question 1

Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test()or by coding from the mathematical definition(http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test).

Answer

Arrange the counts as a $2 \times 4$ table with cells $n_{ij}$ ($i = 1, 2$; $j = 1, \dots, 4$). The Pearson chi-square statistic is $$X^2=\sum_{i=1}^{2}\sum_{j=1}^{4}\frac{(n_{ij}-n_{i\cdot}n_{\cdot j}/n)^2}{n_{i\cdot}n_{\cdot j}/n},$$ where $n_{i\cdot}$ and $n_{\cdot j}$ are the row and column totals and $n$ is the grand total.

# expected cell count under independence, written as n * p_col * p_row
expected <- function(colsum, rowsum, total) {
  (colsum / total) * (rowsum / total) * total
}                      # = n_{i.} * n_{.j} / n in the formula above

# one cell's contribution to the Pearson chi-square statistic
chi_stat <- function(observed, expected) {
  (observed - expected)^2 / expected
}


# Pearson chi-square statistic for a 2 x k table supplied as two vectors of
# counts: x is the first row, y is the second row
chisq_test2 <- function(x, y) {
  rowsum_x <- sum(x)
  rowsum_y <- sum(y)
  total <- rowsum_x + rowsum_y
  chistat <- 0
  # accumulate the two cell contributions of each column in turn
  for (i in seq_along(x)) {
    colsum <- x[i] + y[i]
    chistat <- chistat + chi_stat(x[i], expected(colsum, rowsum_x, total))
    chistat <- chistat + chi_stat(y[i], expected(colsum, rowsum_y, total))
  }
  chistat
}
#examples of the use of the new chisq_test2

print(chisq_test2(seq(3, 8), seq(4, 9)))

print(chisq.test(seq(3, 8), seq(4, 9)))

## NOTE(review): the two values differ because the calls answer different
## questions -- chisq_test2 treats the vectors as the two rows of a table of
## counts, while chisq.test(x, y) cross-tabulates x and y as factors and
## tests independence of that contingency table.

Question 2

Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?

Answer

# Faster two-vector table(): cross-tabulate two equal-length vectors with no
# missing values. Returns a "table" object comparable with table(x, y).
table2 <- function(x, y) {
  # BUG FIX: table() orders its levels by sort(); unique() preserves
  # first-appearance order, so the original disagreed with table() whenever
  # the input was not already sorted.
  x_val <- sort(unique(x))
  y_val <- sort(unique(y))
  mat <- matrix(0L, length(x_val), length(y_val))
  # match() maps every observation to its (row, col) cell in one pass,
  # avoiding the repeated which() scans of the original
  xi <- match(x, x_val)
  yi <- match(y, y_val)
  for (i in seq_along(x)) {
    mat[xi[i], yi[i]] <- mat[xi[i], yi[i]] + 1L
  }
  dimnames <- list(x_val, y_val)
  # name the dimnames after the caller's argument expressions, as table() does
  names(dimnames) <- as.character(as.list(match.call())[-1])
  tab <- array(mat, dim = dim(mat), dimnames = dimnames)
  class(tab) <- "table"
  tab
}
# examples of the use of the new table2(): each identical() check compares
# its output against base table() on small numeric vectors
a <- c(1, 2, 3)
identical(table(a, a), table2(a, a))


b <- c(2, 3, 4)
identical(table(a, b), table2(a, b))


# NOTE(review): this shadows base::c with a vector named `c`; calls such as
# c(...) still resolve to the function (R looks up functions separately),
# but the name is confusing and best avoided
c <- c(1, 2, 3, 1, 2, 3)
d <- c(2, 3, 4, 2, 3, 4)
identical(table(c, d), table2(c, d))


# vectors with repeated values
e <- c(1, 2, 2)
identical(table(a, e), table2(a, e))

identical(table(b, e), table2(b, e))

identical(table(e, e), table2(e, e))


# a constant vector (single level)
f <- c(1, 1, 1)
identical(table(f, f), table2(f, f))

identical(table(e, f), table2(e, f))


g <- c(1, 4, 9)
identical(table(g, g), table2(g, g))

identical(table(g, f), table2(g, f))

Next, apply table2() to the chi-square test: a new function chisq_test3 that takes the cross-tabulated counts as its input.

# expected cell count under independence: n * p_col * p_row
expected <- function(colsum, rowsum, total) {
  (colsum / total) * (rowsum / total) * total
}                      # = n_{i.} * n_{.j} / n

# one cell's contribution to the Pearson chi-square statistic
chi_stat <- function(observed, expected) {
  ((observed - expected) ^ 2) / expected
}

# Pearson chi-square statistic computed from a contingency table z
# (e.g. z <- table2(x, y)).
# BUG FIX: the original chisq_test3(z) ignored its argument entirely and read
# the global vectors x and y (set up by the deleted `x <- y <- NULL` lines),
# so it only "worked" when matching globals happened to exist.
chisq_test3 <- function(z) {
  total <- sum(z)
  row_totals <- rowSums(z)
  chistat <- 0
  for (j in seq_len(ncol(z))) {
    col_total <- sum(z[, j])
    for (i in seq_len(nrow(z))) {
      e <- expected(col_total, row_totals[i], total)
      chistat <- chistat + chi_stat(z[i, j], e)
    }
  }
  chistat
}

compare the time

# timing comparison: raw count vectors vs a precomputed table2() input
x <- seq(3, 8)
y <- seq(4, 9)
system.time(print((chisq_test2(x, y))))

system.time(print(chisq_test3(table2(x, y))))
## NOTE(review): typo fixed ("dose" -> "does"); with inputs this tiny both
## timings are near zero, so any speed-up claim should be confirmed on
## larger data.


peachandbeach/StatComp18069 documentation built on May 5, 2019, 11:07 p.m.