Write a .Rmd file to implement at least three examples of different types in the above books (texts, numerical results, tables, and figures).
# Small patient data set; cross-tabulate diabetes type against outcome status.
patientID <- c(1, 2, 3, 4)
age <- c(25, 34, 28, 52)
diabetes <- c("Type1", "Type2", "Type1", "Type1")
status <- c("Poor", "Improved", "Excellent", "Poor")
patientdata <- data.frame(patientID, age, diabetes, status)
patientdata
table(patientdata$diabetes, patientdata$status)
# Simple linear regression of fuel efficiency (mpg) on vehicle weight (wt),
# followed by the model summary and the standard diagnostic plots.
mpg <- c(39.4, 36.4, 33.8, 31.5, 29.6, 27.8, 26.3, 24.9)
wt <- c(1897, 1665, 1559, 1550, 1490, 1500, 1480, 1380)
lm(mpg ~ wt)
lmfit <- lm(mpg ~ wt)
summary(lmfit)
plot(lmfit)
A discrete random variable $X$ has probability mass function
|$x$|0|1|2|3|4|
|-|-|-|-|-|-|
|$p(x)$|0.1|0.2|0.2|0.2|0.3|
Use the inverse transform method to generate a random sample of size 1000 from the distribution of $X$. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.
# Inverse transform sampling from a discrete pmf on x = 0..4.
x <- c(0, 1, 2, 3, 4)
p <- c(.1, .2, .2, .2, .3)
cp <- cumsum(p)              # cdf values F(x)
m <- 1000
# If F(x[i-1]) < u <= F(x[i]) then return x[i]; findInterval counts how many
# cp values lie below u, hence the +1 to index into x.
# (The original also pre-assigned r <- numeric(m), which was dead code.)
r <- x[findInterval(runif(m), cp) + 1]
# Tabulate against ALL levels so a value never drawn still gets count 0 and
# the vectors stay aligned with p.
ct <- as.vector(table(factor(r, levels = x)))
ct
tp <- ct / sum(ct)           # empirical probabilities
ratio <- tp / p
# relative frequency table: empirical vs. theoretical probabilities
data.frame(x, p, tp, ratio)
Next we repeat using the R sample function.
# Repeat using the built-in sample() function, drawing directly from the pmf.
set.seed(1234)
r2 <- sample(c(0, 1, 2, 3, 4), m, replace = TRUE, prob = c(.1, .2, .2, .2, .3))
# tabulate against all levels so never-drawn values still get a zero count
ct2 <- as.vector(table(factor(r2, levels = c(0, 1, 2, 3, 4))))
tp2 <- ct2 / sum(ct2)        # empirical probabilities
ratio <- tp2 / p
data.frame(x, p, tp2, ratio) # comparison table for the sample() approach
Write a function to generate a random sample of size n from the $Beta(a, b)$ distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the $Beta(3,2)$ distribution. Graph the histogram of the sample with the theoretical $Beta(3,2)$ density superimposed.
$Beta(3,2)$ with pdf $f(x)=12x^2(1-x),~0<x<1$. Let $c=12$ and $g(x)=x,~0<x<1$.
# Acceptance-rejection sampling from Beta(3, 2): f(x) = 12 x^2 (1 - x), 0<x<1,
# with proposal g = U(0, 1).  The smallest valid constant is
# c = max f = f(2/3) = 16/9, so we accept x when
#   u < f(x) / (c g(x)) = (27/4) x^2 (1 - x).
# (The original used c = 12, which is valid but accepts only ~15% of draws.)
n <- 1000
j <- k <- 0                     # j counts proposals, k counts acceptances
y <- numeric(n)
while (k < n) {
  u <- runif(1)
  j <- j + 1
  x <- runif(1)                 # candidate from the proposal density g
  if (27 / 4 * x^2 * (1 - x) > u) {  # accept with probability f(x)/(c g(x))
    k <- k + 1
    y[k] <- x
  }
}
j  # total proposals needed; expected value is about c * n = (16/9) * 1000
# Histogram of the sample with the theoretical Beta(3, 2) density overlaid.
hist(y, prob = TRUE, main = expression(f(x)==12*x^2*(1-x)))
x <- seq(0, 1, .001)
lines(x, 12*x^2*(1-x),col="red")
# The close agreement between histogram and curve shows the generated sample
# follows the theoretical Beta(3, 2) model well.
Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has a $Gamma(r, b)$ distribution and $Y$ has an $Exp(\Lambda)$ distribution. That is, $(Y \mid \Lambda = \lambda) \sim f_Y(y\mid\lambda) = \lambda e^{-\lambda y}$. Generate 1000 random observations from this mixture with $r = 4$ and $b = 2$.
# Exponential-Gamma mixture: Lambda ~ Gamma(r, b) and Y | Lambda = lambda
# ~ Exp(lambda).  Marginally Y follows a generalized Pareto distribution.
# (Stray markdown chunk fences that had been fused into this code are removed.)
library(evir)
n <- 1000; r <- 4; b <- 2
lambda <- rgamma(n, r, b)    # random rates
x <- rexp(n, lambda)         # one exponential draw per random rate
# compare the mixture with the GPD density 1/2^4 * (1/2 + 1/4*y)^(-5)
hist(x, prob = TRUE, col = "grey", main = "Density of GPD(0.25,0,0.5)")
y <- seq(0, 100, .001)
lines(y, 1/2^4 * (1/2 + 1/4 * y)^(-5), col = "red")
# The fitted curve tracks the histogram, especially in the right tail.

# theoretical model: draw directly from the generalized Pareto distribution
y1 <- rgpd(n, xi = 1/r, mu = 0, beta = b/r)
par(mfrow = c(1, 2))
hist(x, main = "Histogram of Simulation", col = "pink")
hist(y1, main = "Histogram of GPD", col = "light green")
# The side-by-side histograms agree, confirming the simulation is good.
Write a function to compute a Monte Carlo estimate of the $Beta(3, 3)$ cdf, and use the function to estimate $F(x)$ for $x = 0.1, 0.2, \ldots, 0.9$. Compare the estimates with the values returned by the `pbeta` function in R.
For $Beta(a,b)$: $$B(a,b)=\int_0^1x^{a-1}(1-x)^{b-1}dx=E[g(X)],\qquad X\sim U(0,1),$$
where $g(x)=(1-0)\,x^{a-1}(1-x)^{b-1}$ (the factor $1-0$ is the length of the integration interval).
# Monte Carlo estimate of the incomplete beta integral
#   B(t; a, b) = integral_0^t x^(a-1) (1-x)^(b-1) dx
# using n uniform draws on (0, t): the sample mean of the integrand times the
# interval length t.
#
# n     number of uniform draws
# t     upper limit of integration (t = 1 gives the complete beta function)
# a, b  shape parameters
MCofBeta <- function(n, t, a, b) {
  x <- runif(n, 0, t)
  # return the estimate explicitly (the original relied on the invisible
  # value of an assignment)
  mean(x^(a - 1) * (1 - x)^(b - 1)) * t
}
# compare the MC estimate of B(3, 3) with the exact value
print(c(MCofBeta(1e4, 1, 3, 3), beta(3, 3)))
## The estimate is close to the exact value, so the MC estimator works well.
$$F(x;a,b)=\frac{B(x;a,b)}{B(a,b)},\qquad B(x;a,b)=\int_0^x t^{a-1}(1-t)^{b-1}dt,\qquad B(a,b)=\int_0^1 t^{a-1}(1-t)^{b-1}dt.$$
# Estimate the Beta(3, 3) cdf F(x) = B(x; 3, 3) / B(3, 3) at x = 0.1, ..., 0.9
# and compare with pbeta.
x <- c(.1, .2, .3, .4, .5, .6, .7, .8, .9)
# preallocate a proper result vector instead of assigning into the builtin F,
# which the original masked
Fhat <- y <- ratio <- numeric(length(x))
for (i in seq_along(x)) {
  Fhat[i] <- MCofBeta(1e4, x[i], 3, 3) / MCofBeta(1e4, 1, 3, 3)  # MC estimate
  y[i] <- pbeta(x[i], 3, 3)                                      # exact value
  ratio[i] <- Fhat[i] / y[i]
}
data.frame(Fhat, y, ratio)
## The ratios are all close to 1, so the MC estimates agree with pbeta.
The Rayleigh density ([156, (18.76)]) is $$f(x) = \frac{x}{\sigma^2}e^{-x^2/(2\sigma^2)},\qquad x\geq0,\ \sigma > 0.$$ Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1, X_2$?
Implement the function MC.R to estimate the $Rayleigh(\sigma)$ cdf using $$\theta=E_U\left[\frac{x^2U}{\sigma^2}e^{-(Ux)^2/(2\sigma^2)}\right],\qquad U\sim U(0,1).$$
# Monte Carlo estimate of the Rayleigh(sigma) cdf
#   F(x) = E_U[ x^2 U / sigma^2 * exp(-(U x)^2 / (2 sigma^2)) ],  U ~ U(0, 1),
# optionally using antithetic variables for variance reduction.
#
# x          vector of evaluation points
# sigma      Rayleigh scale parameter
# n          total number of uniform variates
# antithetic if TRUE, the second half of the sample is 1 - u (default TRUE)
MC.R <- function(x, sigma, n, antithetic = TRUE) {
  # NOTE: the original did `n <- numeric(n)` here, which made runif(n/2)
  # draw n (not n/2) variates; that dead assignment is removed.
  u <- runif(n / 2)
  if (!antithetic) {
    v <- runif(n / 2)   # independent second half
  } else {
    v <- 1 - u          # negatively correlated second half
  }
  u <- c(u, v)
  cdf <- numeric(length(x))
  for (i in seq_along(x)) {
    # integrand evaluated at the uniforms; its mean estimates F(x[i])
    g <- x[i]^2 * u / (sigma^2) * exp(-(u * x[i])^2 / (2 * sigma^2))
    cdf[i] <- mean(g)
  }
  cdf
}
The cdf of the Rayleigh distribution: $F(x;\sigma)=1-e^{-x^2/(2\sigma^2)},\ x\geq0$.
# Compare both MC estimators against the exact Rayleigh cdf
# F(x) = 1 - exp(-x^2 / (2 sigma^2)).
set.seed(1234)
sigma <- 1; n <- 1e4
x <- R <- seq(.1, 2.5, length = 5)            # evaluation points
for (i in seq_along(x)) {
  R[i] <- 1 - exp(-x[i]^2 / (2 * sigma^2))    # exact cdf value
}
MC1 <- MC.R(x, sigma, n, antithetic = FALSE)  # (X1 + X2)/2, independent halves
MC2 <- MC.R(x, sigma, n, antithetic = TRUE)   # (X + X')/2, antithetic halves
print(round(rbind(x, MC1, MC2, R), 5))
## Both estimators track the exact cdf; the antithetic version is closer.
Next compare the variance and calculate the percent of reduction from MC1 to MC2
# Estimate the variance of both estimators at x = 6 and the percent reduction
# achieved by the antithetic variates.
set.seed(1234)
m <- 1e2; sigma <- 1; x <- 6; n <- 1e4
MC1 <- MC2 <- numeric(m)
for (i in 1:m) {
  MC1[i] <- MC.R(x, sigma, n, antithetic = FALSE)
  MC2[i] <- MC.R(x, sigma, n, antithetic = TRUE)
}
var1 <- var(MC1)
var2 <- var(MC2)
ratio <- (var1 - var2) / var1   # percent reduction in variance
print(c(var1, var2, ratio))
Find two importance functions $f_1$ and $f_2$ that are supported on $(1, \infty)$ and are "close" to $$g(x) = \frac{x^2}{\sqrt{2\pi}}e^{-x^2/2},\qquad x > 1.$$ Which of your two importance functions should produce the smaller variance in estimating $$\int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling? Explain.
$$g(x) = \frac{x^2}{\sqrt{2\pi}}e^{-x^2/2},\ x > 1;\qquad f_1(x)=\frac1{x^2},\ x>1;\qquad f_2(x)=\frac{x}{\sqrt{2\pi}}e^{(1-x^2)/4},\ x>1.$$
# Visual comparison of g with the two candidate importance functions.
x <- seq(1, 7, .01)
w <- 2                                   # line width
g <- exp(-x^2/2) * x^2 / sqrt(2*pi)
f1 <- 1 / (x^2)
f2 <- x * exp((1 - x^2)/4) / sqrt(2*pi)
gs <- c(expression(g(x) == e^{-x^2/2} * x^2 / sqrt(2*pi)),
        expression(f[1](x) == 1 / (x^2)),
        expression(f[2](x) == x * e^{(1 - x^2)/4} / sqrt(2*pi)))
par(mfrow = c(1, 2))
# figure (A): the three functions themselves
plot(x, g, type = "l", ylab = "", ylim = c(0, 2), lwd = w, col = 1, main = "(A)")
lines(x, f1, lty = 2, lwd = w, col = 2)
lines(x, f2, lty = 3, lwd = w, col = 3)
legend("topright", legend = gs, lty = 1:3, lwd = w, inset = 0.02, col = 1:3)
# figure (B): the ratios g/f -- the flatter the ratio, the smaller the
# importance-sampling variance
plot(x, g/f1, type = "l", ylim = c(0, 3.2), ylab = "", lty = 2, lwd = w, col = 2)
lines(x, g/f2, lty = 3, lwd = w, col = 3)
legend("topright", legend = gs[-1], lty = 2:3, lwd = w, inset = 0.02, col = 2:3)
## f2 is closer to g in (A) and g/f2 is flatter in (B), so f2 should be the
## better importance function (smaller variance).
# Importance-sampling estimates of integral_1^Inf x^2 phi(x) dx (exact value
# phi(1) + 1 - Phi(1) ~ 0.4006) with the two candidate importance functions.
g <- function(x) {x^2 * exp(-x^2/2) / sqrt(2*pi) * (x > 1)}
m <- 10000                 # sample size
theta.hat <- se <- numeric(2)
set.seed(123)              # reproducible comparison

# f1(x) = 1/x^2 on (1, Inf): F1(x) = 1 - 1/x, so X = 1/(1 - U).
u <- runif(m)
x <- 1 / (1 - u)
fg <- g(x) * x^2           # importance weight g(x) / f1(x)
theta.hat[1] <- mean(fg)
se[1] <- sd(fg)

# f2 must be a proper density.  Normalizing x * exp((1 - x^2)/4) on (1, Inf)
# gives f2(x) = (x/2) exp((1 - x^2)/4), with cdf F2(x) = 1 - exp((1 - x^2)/4)
# and inverse X = sqrt(1 - 4 log(1 - U)).  (The original sampled from an
# unnormalized function with an inconsistent inverse, biasing the estimate.)
u <- runif(m)
x <- sqrt(1 - 4 * log(1 - u))
fg <- g(x) / (x / 2 * exp((1 - x^2) / 4))
theta.hat[2] <- mean(fg)
se[2] <- sd(fg)
rbind(theta.hat, se)
## As the figures suggested, f2 gives the smaller standard error.
Obtain a Monte Carlo estimate of $$\int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling.
We take $f$ close to $g$, following question 3. Normalizing $x\,e^{(1-x^2)/4}$ on $(1,\infty)$ gives the proper density $$f(x)=\frac{x}{2}\,e^{(1-x^2)/4},\qquad x>1,$$ and then $$\int_1^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx=E_f\left[\frac{g(X)}{f(X)}\right].$$
# Final importance-sampling estimate of integral_1^Inf x^2 phi(x) dx using the
# normalized importance density f(x) = (x/2) exp((1 - x^2)/4), x > 1.
g <- function(x) {x^2 * exp(-x^2/2) / sqrt(2*pi) * (x > 1)}
m <- 1e6                                     # sample size
set.seed(123)
u <- runif(m)                                # U(0, 1) draws
# inverse transform for F(x) = 1 - exp((1 - x^2)/4): X = sqrt(1 - 4 log(1-U))
x <- sqrt(1 - 4 * log(1 - u))
fg <- g(x) / (x / 2 * exp((1 - x^2) / 4))    # importance weights g/f
theta.hat <- mean(fg)                        # estimate of the integral
theta.hat
Let X be a non-negative random variable with $\mu = E[X] < \infty$. For a random sample $x_1, \ldots, x_n$ from the distribution of X, the Gini ratio is defined by$$G=\frac1{2n^2\mu}\sum_{j=1}^n\sum_{i=1}^n|x_i-x_j|.$$ The Gini ratio is applied in economics to measure inequality in income distribution (see e.g. [163]). Note that G can be written in terms of the order statistics $x_{(i)}$ as $$G=\frac1{n^2\mu}\sum_{i=1}^n(2i-n-1)x_{(i)}.$$ If the mean is unknown, let $\hat{G}$ be the statistic G with $\mu$ replaced by $\bar{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if X is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.
# Simulate the sampling distribution of the Gini ratio estimate
#   G.hat = sum_i (2i - n - 1) x_(i) / (n^2 * mu)
# for samples drawn by `cdf`.
#
# cdf   function drawing n iid values (e.g. rlnorm, runif)
# mu    theoretical mean E[X] of the sampled distribution
# n     sample size per replicate
# m     number of replicates
# a, b  x-axis limits for the density histogram
#
# Prints the mean, median and deciles of the replicated Gini ratios, draws a
# density histogram, and invisibly returns the replicates.
GiniEstimation <- function(cdf, mu, n, m, a, b) {
  Gcdf <- numeric(m)
  set.seed(1234)                  # reproducible replicates
  w <- 2 * seq_len(n) - n - 1     # order-statistic weights (vectorized; the
                                  # original computed them in an inner loop)
  for (i in seq_len(m)) {
    xs <- sort(cdf(n))            # order statistics of the sample
    Gcdf[i] <- sum(w * xs) / (n^2 * mu)
  }
  print(mean(Gcdf))
  print(quantile(Gcdf, 0.5))
  print(quantile(Gcdf, probs = seq(0, 1, 0.1)))
  hist(Gcdf, freq = FALSE, xlim = c(a, b),
       main = "Gini ratio of supposed distribution")
  invisible(Gcdf)
}
The standard lognormal has theoretical expectation $EX=e^{1/2}$.
# Standard lognormal case: E[X] = exp(1/2); the true Gini ratio is
# erf(sigma/2) with sigma = 1, about 0.52.
GiniEstimation(rlnorm,exp(0.5),1e3,1e2,0.45,0.65)
## The mean is close to the median, as the deciles and the histogram show.
The uniform distribution with$X\sim \mathbb{U}(0,1)$$E(X)=0.5$
# Uniform case: X ~ U(0, 1) with E[X] = 0.5; the true Gini ratio is 1/3.
GiniEstimation(runif,0.5,1e3,1e2,0.32,0.35)
## The mean is close to the median, as the deciles and the histogram show.
The Bernoulli(0.1) case, summed over 1000 trials: $X\sim Binomial(1000,0.1)$ with $E(X)=100$.
# Gini ratio replicates when X ~ Binomial(1000, 0.1), so E[X] = 100.
n <- m <- 1e3
mu <- 100
Gcdf <- numeric(m)
set.seed(1234)
w <- 2 * seq_len(n) - n - 1             # order-statistic weights (vectorized)
for (i in seq_len(m)) {
  xs <- sort(rbinom(n, 1000, 0.1))      # order statistics of the sample
  Gcdf[i] <- sum(w * xs) / (n^2 * mu)   # Gini ratio for replicate i
}
print(mean(Gcdf))
print(quantile(Gcdf, 0.5))
print(quantile(Gcdf, probs = seq(0, 1, 0.1)))
# density histogram of the replicates (the original title said "Bernoulli",
# but rbinom(n, 1000, 0.1) is Binomial(1000, 0.1))
hist(Gcdf, freq = FALSE, xlim = c(0.048, 0.058),
     main = "Gini ratio of Binomial(1000, 0.1)")
## The mean is close to the median, as the histogram also suggests.
Construct an approximate 95% confidence interval for the Gini ratio $\gamma = E[G]$ if X is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.
First we estimate $\sigma=sd(\log X)$, where $\log X\sim N(0,1)$; then we use $\sigma$ to calculate $\gamma$: $$\gamma=E[G]=\mathrm{erf}(\sigma/2)=\frac2{\sqrt{\pi}}\int_0^{\sigma/2}e^{-t^2}dt,$$ from which we can calculate the CI.
# Approximate 95% CI for the Gini ratio gamma = E[G] when X is lognormal:
# gamma = erf(sigma/2) = 2*Phi(sigma/sqrt(2)) - 1 is monotone in sigma, so a
# chi-square CI for sigma^2 (from log X ~ N(mu, sigma^2)) maps to a CI for
# gamma via sigma = sqrt(sigma^2).  Coverage is then estimated by Monte Carlo.
n <- 1000                  # observations per sample
m <- 100                   # intervals per batch, and number of batches
alpha <- 0.05
G <- function(b) {         # true gamma as a function of sigma (b): erf(b/2)
  2 * pnorm(b / sqrt(2)) - 1
}
CI.sigma2 <- function(x, n) {  # chi-square CI for sigma^2 of a normal sample
  s2 <- sum((x - mean(x))^2) / (n - 1)
  c((n - 1) * s2 / qchisq(1 - alpha / 2, n - 1),
    (n - 1) * s2 / qchisq(alpha / 2, n - 1))
}
ciG <- cbind(numeric(m), numeric(m))
C <- CR <- numeric(m)
e <- G(1)                  # true gamma: sigma = 1 for the standard lognormal
set.seed(1234)
for (j in 1:m) {
  for (i in 1:m) {
    x <- rlnorm(n)
    ci <- CI.sigma2(log(x), n)
    # take square roots: the interval is for sigma^2, but G expects sigma
    # (the original fed the sigma^2 endpoints straight into G)
    ciG[i, ] <- G(sqrt(ci))
    # does the i-th interval cover the true gamma?
    C[i] <- as.integer(e > ciG[i, 1] && e < ciG[i, 2])
  }
  CR[j] <- mean(C)         # coverage rate of the j-th batch
}
mean(CR)
## The Monte Carlo coverage rate is close to the nominal 0.95.
Tests for association based on the Pearson product moment correlation $\rho$, Spearman's rank correlation coefficient $\rho_s$, or Kendall's coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X, Y) such that X and Y are dependent) such that at least one of the nonparametric tests has better empirical power than the correlation test against this alternative.
Assume $(X,Y)\sim N(\mu,\Sigma)$, with $\mu=(0,0)^T$ and $\Sigma$ is of following: \begin{matrix} 1 & \rho\ \rho & 1 \end{matrix}
Spearman's rank correlation:
# Empirical power of the Spearman rank-correlation test across a grid of
# bivariate-normal correlations rho (the alternatives).
library(MASS)
n <- 20      # sample size per simulated data set
m <- 1000    # number of simulated data sets per rho
rho.s <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1)) #alternatives
M <- length(rho.s)
power.s <- numeric(M)
set.seed(1234)
for (i in 1:M) {
  rho <-rho.s[i]
  pvalues.s <- replicate(m, expr = {
    #simulate a bivariate normal sample under the alternative rho
    x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
    cortest.s <- cor.test(x[,1],x[,2], alternative = "two.sided",method="spearman")
    cortest.s$p.value
  } )
  power.s[i] <- mean(pvalues.s<= .05) #proportion of rejections at level 0.05
}
power.s
Kendall's coefficient:
# Empirical power of the Kendall tau test across the same grid of
# bivariate-normal correlations (requires MASS::mvrnorm from the chunk above).
n <- 20      # sample size per simulated data set
m <- 1000    # number of simulated data sets per rho
rho.k <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1)) #alternatives
M <- length(rho.k)
power.k <- numeric(M)
set.seed(1234)
for (i in 1:M) {
  rho <-rho.k[i]
  pvalues.k <- replicate(m, expr = {
    #simulate a bivariate normal sample under the alternative rho
    x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
    cortest.k <- cor.test(x[,1],x[,2], alternative = "two.sided",method="kendall")
    cortest.k$p.value
  } )
  power.k[i] <- mean(pvalues.k<= .05) #proportion of rejections at level 0.05
}
power.k
Pearson's correlation:
# Empirical power of the Pearson correlation test across the same grid of
# bivariate-normal correlations (requires MASS::mvrnorm from the chunk above).
n <- 20      # sample size per simulated data set
m <- 1000    # number of simulated data sets per rho
rho.p <- c(seq(-0.8,-0.1,0.1),seq(0.1,0.8,0.1)) #alternatives
M <- length(rho.p)
power.p <- numeric(M)
set.seed(1234)
for (i in 1:M) {
  rho <-rho.p[i]
  pvalues.p <- replicate(m, expr = {
    #simulate a bivariate normal sample under the alternative rho
    x <- mvrnorm(n, c(0,0), matrix(c(1,rho,rho,1),2,2))
    cortest.p <- cor.test(x[,1],x[,2], alternative = "two.sided",method="pearson")
    cortest.p$p.value
  } )
  power.p[i] <- mean(pvalues.p<= .05) #proportion of rejections at level 0.05
}
power.p
Make comparision
# Collect the three power curves into one table, with columns labelled by rho.
power.rho <- rbind(power.s, power.k, power.p)
colnames(power.rho) <- c(seq(-0.8, -0.1, 0.1), seq(0.1, 0.8, 0.1))
power.rho
The result shows that the nonparametric tests based on $\rho_s$ or $\tau$ are less powerful than the correlation test in the bivariate normal case.
The example $X\sim t_{2}$, $Y\sim U(-1,0)$ when $X<0$ and $Y\sim U(0,1)$ when $X\ge0$.$X,Y$ are dependent.
# Power of Spearman's test against a dependent, non-normal alternative:
# X ~ t(2); Y ~ U(-1, 0) when X < 0 and Y ~ U(0, 1) when X >= 0.
n <- 20
m <- 1000
set.seed(1234)
pvalues.s1 <- replicate(m, expr = {
  x <- rt(n, 2)
  # runif(n) - (x < 0) is U(0,1) shifted to U(-1,0) exactly where x < 0
  # (vectorizes the original element-by-element loop)
  y <- runif(n) - (x < 0)
  cor.test(x, y, alternative = "two.sided", method = "spearman")$p.value
})
power.s1 <- mean(pvalues.s1 <= .05)
power.s1
## The nonparametric test has high power against this alternative.
# Power of the Pearson correlation test against the same alternative.
n <- 20
m <- 1000
set.seed(431)
pvalues.p1 <- replicate(m, expr = {
  x <- rt(n, 2)
  y <- runif(n) - (x < 0)   # U(-1,0) where x < 0, U(0,1) otherwise
  cor.test(x, y, alternative = "two.sided", method = "pearson")$p.value
})
power.p1 <- mean(pvalues.p1 <= .05)
power.p1
Power of the nonparametric test Spearman's rank correlation coefficient is much better according to the results.
Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.
This question is about the correlation between LSAT and GPA in the law data. The jackknife estimate of the bias $E(\hat\theta)-\theta_0$ is $$(n-1)\left(\bar{\hat\theta}_{(\cdot)}-\hat\theta\right),$$ and the jackknife estimate of $se(\hat\theta)$ is $$\sqrt{\frac{n-1}{n}\sum_{i=1}^n\left(\hat\theta_{(i)}-\bar{\hat\theta}_{(\cdot)}\right)^2}.$$
# Declarations: the law data (LSAT, GPA) from the bootstrap package.
library(bootstrap)
n <- nrow(law)            # sample size
theta.jack <- numeric(n)  # leave-one-out correlation estimates
# Jackknife estimates of the bias and standard error of cor(LSAT, GPA).
theta.hat <- cor(law[, 1], law[, 2])       # full-sample correlation
b.cor <- function(x, i) cor(x[i, 1], x[i, 2])
for (i in 1:n) {
  theta.jack[i] <- b.cor(law, (1:n)[-i])   # leave observation i out
}
theta.bar <- mean(theta.jack)              # jackknife mean
bias.jack <- (n - 1) * (theta.bar - theta.hat)
# per the jackknife formula, deviations are taken about the jackknife mean
# (the original used theta.hat here)
se.jack <- sqrt((n - 1) * mean((theta.jack - theta.bar)^2))
round(c(original = theta.hat, bias = bias.jack, se = se.jack), 3)
## Jackknife bias and standard error of the LSAT/GPA correlation shown above.
Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $\frac{1}{\lambda}$ by the standard normal, basic, percentile,and BCa methods. Compare the intervals and explain why they may differ.
The MLE of $\lambda$: $\hat\lambda=\frac{n}{\sum_{i=1}^{n}x_i}=\frac{1}{\bar{X}}$.
# Declarations for the bootstrap CIs of the mean time between failures.
library(boot)
n <- 100   # sample size for the coverage experiment
m <- 100   # number of Monte Carlo replicates
ci.norm <- ci.basic <- ci.perc <- ci.bca <- matrix(NA, m, 2)
# Bootstrap CIs (normal, basic, percentile, BCa) for the mean of
# aircondit$hours.
boot.mean <- function(x, i) mean(x$hours[i])
de <- boot(aircondit, statistic = boot.mean, R = 999)
ci <- boot.ci(de, type = c("norm", "basic", "perc", "bca"))
ci
## The four intervals differ noticeably: the sample is small and skewed, so
## the bootstrap distribution of the mean is far from normal.
Compare the four kinds of CI
# Monte Carlo check of the coverage of the four bootstrap CI types when
# sampling from Exp(lambda.hat), with lambda.hat the MLE from aircondit.
set.seed(4)
mu <- mean(aircondit$hours) #estimated mean time between failures
lambda <- 1/mu              #MLE of the exponential rate
tboot.mean <- function(x,i) mean(x[i])
for (i in 1:m) {
  U<-runif(n);R<--log(1-U)/lambda #exponential sample via inverse transform
  de <- boot(data=R,statistic=tboot.mean, R = 999)
  ci <- boot.ci(de,type=c("norm","basic","perc","bca"))
  ci.norm[i,]<-ci$norm[2:3];ci.basic[i,]<-ci$basic[4:5]
  ci.perc[i,]<-ci$percent[4:5];ci.bca[i,]<-ci$bca[4:5]
}
# empirical coverage: fraction of intervals containing the true mean mu
cat('norm =',mean(ci.norm[,1]<=mu & ci.norm[,2]>=mu),
    'basic =',mean(ci.basic[,1]<=mu & ci.basic[,2]>=mu),
    'perc =',mean(ci.perc[,1]<=mu & ci.perc[,2]>=mu),
    'BCa =',mean(ci.bca[,1]<=mu & ci.bca[,2]>=mu))
## The percentile method gives the best coverage for this exponential model
## with the rate fixed at the aircondit MLE.
Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat\theta$.
We have the empirical estimate $$\hat\theta=\frac{\hat\lambda_1}{\sum_{j=1}^{m}\hat\lambda_j},$$ where $m$ is the number of columns of the data and $\hat\lambda_1\ge\cdots\ge\hat\lambda_m$ are the eigenvalues of the sample covariance matrix sorted in decreasing order.
# Declarations: the scor test-score data from the bootstrap package.
library(bootstrap)
n <- nrow(scor)                      # number of observations
m <- ncol(scor)                      # number of variables
covmatrix <- matrix(NA, m, m)        # covariance matrix placeholder
eigenvalues <- lambda <- numeric(m)  # eigenvalues and their sorted version
# Jackknife bias and standard error of theta.hat = lambda_1 / sum(lambda_j),
# the proportion of variance explained by the LARGEST eigenvalue.
covmatrix <- cov(scor)
eigenvalues <- eigen(covmatrix)$values
# sort in DECREASING order so lambda[1] is the largest eigenvalue
# (the original sorted increasingly and so used the smallest)
lambda <- sort(eigenvalues, decreasing = TRUE)
theta.hat <- lambda[1] / sum(lambda)
b.theta <- function(x, i) {
  ev <- eigen(cov(x[i, ]))$values
  max(ev) / sum(ev)       # theta for the leave-one-out sample
}
theta.jack <- numeric(n)  # was never declared before use in the original
for (i in 1:n) {
  theta.jack[i] <- b.theta(scor, (1:n)[-i])  # leave observation i out
}
theta.bar <- mean(theta.jack)
bias.jack <- (n - 1) * (theta.bar - theta.hat)
# deviations about the jackknife mean, per the standard formula
se.jack <- sqrt((n - 1) * mean((theta.jack - theta.bar)^2))
round(c(original = theta.hat, bias = bias.jack, se = se.jack), 3)
## Jackknife bias and standard error of theta.hat shown above.
In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.
# Declarations for the leave-two-out cross-validation comparison.
library(DAAG)
# NOTE(review): attach() is generally discouraged; it is kept because the
# following chunks refer to magnetic/chemical directly.
attach(ironslag)
n <- length(magnetic) #in DAAG ironslag
e1 <- e2 <- e3 <- e4 <- numeric(n) # per-observation prediction errors, one
                                   # vector per candidate model
# Leave-two-out cross validation: fold k holds out observations 2k-1 and 2k,
# fits the four models on the rest, and records both prediction errors.
# (The original removed the wrong predictors: chemical[-2*k-1] drops element
# 2k+1 by operator precedence, and the second deletion was applied after the
# indices had already shifted.)
for (k in 1:floor(n / 2)) {
  out <- c(2 * k - 1, 2 * k)       # indices of the held-out pair
  y <- magnetic[-out]
  x <- chemical[-out]

  J1 <- lm(y ~ x)                  # linear model
  e1[out] <- magnetic[out] - (J1$coef[1] + J1$coef[2] * chemical[out])

  J2 <- lm(y ~ x + I(x^2))         # quadratic model
  e2[out] <- magnetic[out] -
    (J2$coef[1] + J2$coef[2] * chemical[out] + J2$coef[3] * chemical[out]^2)

  J3 <- lm(log(y) ~ x)             # exponential model
  e3[out] <- magnetic[out] - exp(J3$coef[1] + J3$coef[2] * chemical[out])

  J4 <- lm(log(y) ~ log(x))        # log-log model
  e4[out] <- magnetic[out] - exp(J4$coef[1] + J4$coef[2] * log(chemical[out]))
}
# n is odd, so the last observation is never held out; average only over the
# observations that actually were (the original counted a spurious zero error)
used <- 1:(2 * floor(n / 2))
c(mean(e1[used]^2), mean(e2[used]^2), mean(e3[used]^2), mean(e4[used]^2))
## Model 2, the quadratic model, attains the smallest prediction error and is
## the best fit by this criterion.
J2 # coefficients of the quadratic model fitted in the final CV fold
##The related quadratic model is shown above
The model is$$\hat Y=19.15533-0.77001X+0.03697X^2$$
Implement the two-sample Cramer-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.
The Cramér-von Mises statistic: $$W_2=\frac{mn}{(m+n)^2}\left[\sum_{i=1}^{n}\left(F_n(x_i)-G_m(x_i)\right)^2+\sum_{j=1}^{m}\left(F_n(y_j)-G_m(y_j)\right)^2\right]$$
# Data and storage for the two-sample Cramer-von Mises permutation test.
attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)
z <- c(x, y)                 # pooled sample
m <- length(x); n <- length(y); N <- length(z)
S <- numeric(N)              # per-point terms of the statistic
R <- 999                     # number of permutation replicates
W <- k <- numeric(R)         # replicate statistics / permutation indices
# Two-sample Cramer-von Mises statistic.  Fn and Gm are the two empirical
# cdfs; ecdf objects are vectorized, so no index loop (or the global S) is
# needed.
cvm <- function(x1, y1) {
  Fn <- ecdf(x1)
  Gm <- ecdf(y1)
  z1 <- c(x1, y1)
  m * n / (m + n)^2 * sum((Fn(z1) - Gm(z1))^2)
}
W0 <- cvm(x, y)              # observed statistic
for (i in 1:R) {
  # random indices for the permuted first sample
  k <- sample(1:N, size = m, replace = FALSE)
  W[i] <- cvm(z[k], z[-k])   # the complement forms the second sample
}
p <- mean(c(W0, W) >= W0)    # permutation p-value
p
## The p-value is large, so the data do not support the alternative that the
## two distributions differ.
Design experiments for evaluating the performance of the NN, energy, and ball methods in various situations.
(1) Unequal variances and equal expectations
(2) Unequal variances and unequal expectations
(3) Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)
(4) Unbalanced samples (say, 1 case versus 10 controls)
Note: The parameters should be choosen such that the powers are distinguishable (say, range from 0.3 to 0.9).
# Packages and parameters for the NN / energy / ball power experiments.
library(RANN)
library(energy)
library(Ball)
library(boot)
m <- 103          # number of simulated data sets per scenario
k <- 3            # number of nearest neighbours
p <- 2            # dimension
mu <- 0.5
set.seed(12345)
n1 <- n2 <- 50    # group sizes
R <- 999          # permutation replicates
n <- n1 + n2
N <- c(n1, n2)
# Nearest-neighbour statistic: fraction of each point's k nearest neighbours
# that fall in the same sample, computed on the permuted pooled data.
Tn <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  z <- z[ix, ]                  # apply the permutation
  o <- rep(0, NROW(z))
  z <- as.data.frame(cbind(z, o))
  NN <- nn2(z, k=k+1)           # k+1 because each point is its own neighbour
  block1 <- NN$nn.idx[1:n1,-1]
  block2 <- NN$nn.idx[(n1+1):n,-1]
  i1 <- sum(block1 < n1 + .5)   # neighbours of sample 1 inside sample 1
  i2 <- sum(block2 > n1 + .5)   # neighbours of sample 2 inside sample 2
  return((i1 + i2) / (k * n))
}
# Permutation NN test of equal distributions, driven by boot().
eqdist.nn <- function(z,sizes,k){
  boot.obj <- boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}
p.values <- matrix(NA,m,3)
# Scenario labelled "unequal variances and equal expectations".
# NOTE(review): the code actually shifts the MEAN of y's 2nd coordinate by mu
# while variances stay equal -- confirm the scenario matches the label.
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value #NN test
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value #energy test
  p.values[i,3] <- bd.test(x=x,y=y,R=R,seed=i*12345)$p.value #ball test
}
alpha <- 0.05 # significance level
pow <- colMeans(p.values<alpha) # empirical power of the three tests
pow
# Scenario labelled "unequal variances and unequal expectations": the mean
# shift in y's 2nd coordinate is increased to 0.6.
# NOTE(review): only the mean changes here as well -- confirm intended.
mu <- 0.6
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value #NN test
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value #energy test
  p.values[i,3] <- bd.test(x=x,y=y,R=R,seed=i*12345)$p.value #ball test
}
alpha <- 0.05 # significance level
pow <- colMeans(p.values<alpha) # empirical power of the three tests
pow
# Non-normal scenario: 1st coordinate t(1) (heavy tails); 2nd coordinate a
# bimodal normal mixture, with mixture means (0, 1) for x and (1, 2) for y.
for(i in 1:m){
  x <- cbind(rt(n1,1),rnorm(n2, mean = sample(c(0, 1), size = n2, prob = c(0.5, 0.5), replace = T), sd =1 ))
  y <- cbind(rt(n2,1),rnorm(n2, mean = sample(c(1, 2), size = n2, prob = c(0.5, 0.5), replace = T), sd =1 ))
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value #NN test
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value #energy test
  p.values[i,3] <- bd.test(x=x,y=y,R=R,seed=i*12345)$p.value #ball test
}
alpha <- 0.05 # significance level
pow <- colMeans(p.values<alpha) # empirical power of the three tests
pow
# Unbalanced samples: 10 observations versus 100 controls (roughly 1 : 10).
n1 <- 10;n2 <- 1e2;N = c(n1,n2)
for(i in 1:m){
  x <- matrix(rnorm(n1*p),ncol=p);
  y <- cbind(rnorm(n2),rnorm(n2,mean=mu));
  z <- rbind(x,y)
  p.values[i,1] <- eqdist.nn(z,N,k)$p.value #NN test
  p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value #energy test
  p.values[i,3] <- bd.test(x=x,y=y,R=R,seed=i*12345)$p.value #ball test
}
alpha <- 0.05 # significance level
pow <- colMeans(p.values<alpha) # empirical power of the three tests
pow
Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a $Cauchy(\theta, \eta)$ distribution has density function $$ f(x) =\frac{1} {\theta\pi(1 + [(x- \eta)/\theta]^2)}, - \infty < x < \infty, \theta>0. $$ The standard Cauchy has the $Cauchy(\theta= 1, \eta = 0)$ density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)
# Storage and parameters for the Metropolis-Hastings Cauchy sampler.
library(stats)
theta <- 1        # Cauchy scale
eta <- 0          # Cauchy location
m <- 10000        # chain length
x <- numeric(m)   # chain storage
# Random-walk Metropolis-Hastings sampler targeting the Cauchy(theta, eta)
# density.  The U(-1, 1) increment proposal is symmetric, so the acceptance
# ratio reduces to f(y) / f(xt).  (The original multiplied both sides of the
# ratio by matching dnorm terms, which cancel exactly and were wasted work.)
f <- function(x, theta, eta) {
  stopifnot(theta > 0)   # scale must be positive (the original returned NULL)
  1 / (theta * pi * (1 + ((x - eta) / theta)^2))
}
set.seed(1234)           # make the chain reproducible
x[1] <- runif(1, min = -1, max = 1)
k <- 0                   # rejection counter
u <- runif(m)
for (i in 2:m) {
  xt <- x[i - 1]
  y <- xt + runif(1, min = -1, max = 1)   # symmetric random-walk proposal
  if (u[i] <= f(y, theta, eta) / f(xt, theta, eta)) {
    x[i] <- y
  } else {
    x[i] <- xt
    k <- k + 1           # proposal rejected
  }
}
print(k)                 # total number of rejections
Compare the deciles of the generated observations with the deciles of the standard Cauchy distribution
b <- 1001                 # burn-in cutoff
y <- x[b:m]               # chain after discarding the first 1000 values
a <- ppoints(100)
QC <- qcauchy(a)          # theoretical standard Cauchy quantiles
# sample quantiles of the post-burn-in chain
# (the original mistakenly computed quantile(x, a) on the full chain)
Q <- quantile(y, a)
qqplot(QC, Q, main = "", xlim = c(-2, 2), ylim = c(-2, 2),
       xlab = "Cauchy Quantiles", ylab = "Sample Quantiles")
# Density histogram of the retained chain with the target density overlaid.
hist(y, breaks = "scott", main = "", xlab = "", freq = FALSE)
lines(QC, f(QC, theta, eta))
Rao $[220, Sec. 5g]$ presented an example on genetic linkage of 197 animals in four categories (also discussed in $[67, 106, 171, 266])$. The group sizes are $(125, 18, 20, 34)$. Assume that the probabilities of the corresponding multinomial distribution are $$ (\frac12+\frac{\theta}{4},\frac{1-\theta}4,\frac{1-\theta}4,\frac{\theta}4). $$ Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.
# Observed multinomial group sizes and MH tuning parameters.
gsize <- c(125, 18, 20, 34)  # counts for the four categories (197 animals)
m <- 5000                    # chain length
w <- .25                     # half-width of the uniform proposal increment
burn <- 1000                 # burn-in length
# Random-walk MH sampler for the posterior of theta under the multinomial
# model (1/2 + theta/4, (1-theta)/4, (1-theta)/4, theta/4).
prob <- function(y, gsize) {
  # unnormalized posterior: multinomial likelihood with a flat prior on (0, 1)
  if (y < 0 || y > 1)
    return(0)
  (1/2 + y/4)^gsize[1] * ((1 - y)/4)^gsize[2] *
    ((1 - y)/4)^gsize[3] * (y/4)^gsize[4]
}
u <- runif(m)                # accept/reject uniforms
v <- runif(m, -w, w)         # proposal increments
x <- numeric(m)              # chain storage (missing in the original, which
                             # silently reused x left over from earlier chunks)
x[1] <- .25
for (i in 2:m) {
  y <- x[i - 1] + v[i]
  if (u[i] <= prob(y, gsize) / prob(x[i - 1], gsize))
    x[i] <- y
  else
    x[i] <- x[i - 1]
}
xtheta <- x[(burn + 1):m]    # discard burn-in
theta.hat <- mean(xtheta)    # posterior mean estimate of theta
theta.hat
## The posterior-mean estimate of theta is shown above.
Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are (125, 18, 20, 34). Assume that the probabilities of the corresponding multinomial distribution are$$(\frac{1}{2}+\frac{\theta}{4},\frac{1-\theta}{4},\frac{1-\theta}{4},\frac{\theta}{4}).$$ Estimate the posterior distribution of $\theta$ given the observed sample, using one of the methods in this chapter.
# Declarations (duplicate exercise, this time with the chain preallocated).
gsize <- c(125, 18, 20, 34)  # group sizes, 197 animals in total
m <- 5000                    # chain length
w <- .25                     # proposal half-width
burn <- 1000                 # burn-in length
x <- numeric(m)              # chain storage
# Random-walk MH sampler for the posterior of theta (same model as above).
prob <- function(y, gsize) {
  # unnormalized posterior of theta given the multinomial counts
  if (y < 0 || y > 1)
    return(0)
  (1/2 + y/4)^gsize[1] * ((1 - y)/4)^gsize[2] *
    ((1 - y)/4)^gsize[3] * (y/4)^gsize[4]
}
u <- runif(m)          # accept/reject uniforms
v <- runif(m, -w, w)   # proposal increments
x[1] <- .25
for (i in 2:m) {
  y <- x[i - 1] + v[i]
  if (u[i] <= prob(y, gsize) / prob(x[i - 1], gsize))
    x[i] <- y
  else
    x[i] <- x[i - 1]
}
# (the original printed the entire 5000-element chain here; removed as noise)
xtheta <- x[(burn + 1):m]   # discard burn-in
theta.hat <- mean(xtheta)   # posterior mean estimate of theta
theta.hat
## The posterior-mean estimate of theta is shown above.
Write a function to compute the cdf of the Cauchy distribution, which has density$$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},\qquad-\infty<x<\infty,\ \theta>0,$$and compare the results with those of the R function `pcauchy`.
# Evaluation points and parameters for the Cauchy cdf computation.
q <- c(-222, -22, -2, 0, 4, 44, 444, Inf)
m <- length(q)
CDF <- numeric(m)      # numerical-integration results, one per q
eta <- 0; theta <- 1   # location and scale of the Cauchy density
# Cauchy density; integrating it from -Inf to q gives the cdf, which is then
# compared against stats::pcauchy.
f <- function(x, eta, theta) {
  1 / (1 + ((x - eta) / theta)^2) / theta / pi
}
for (i in 1:m) {
  CDF[i] <- integrate(f, lower = -Inf, upper = q[i],
                      rel.tol = .Machine$double.eps^0.25,
                      eta = eta, theta = theta)$value
}
# store the reference values under a name that does not mask the pcauchy()
# function (the original assigned to a variable called `pcauchy`)
exact <- pcauchy(q, location = eta, scale = theta)
data.frame(q, exact, CDF)
## The numerically integrated cdf matches pcauchy at every q.
A-B-O blood type problem. Let the three alleles be A, B, and O.
| Genotype  | AA       | BB       | OO       | AO       | BO       | AB       | sum |
|-----------|----------|----------|----------|----------|----------|----------|-----|
| Frequency | $p^2$    | $q^2$    | $r^2$    | $2pr$    | $2qr$    | $2pq$    | 1   |
| Count     | $n_{AA}$ | $n_{BB}$ | $n_{OO}$ | $n_{AO}$ | $n_{BO}$ | $n_{AB}$ | $n$ |
Observed data: $n_{A\cdot} = n_{AA} + n_{AO} = 28$ (A-type), $n_{B\cdot} = n_{BB} + n_{BO} = 24$ (B-type), $n_{OO} = 41$ (O-type), $n_{AB} = 70$ (AB-type).
Use the EM algorithm to solve for the MLE of $p$ and $q$ (treating $n_{AA}$ and $n_{BB}$ as missing data).
Record the maximum likelihood values in the M-steps; are they increasing?
# EM algorithm for the A-B-O blood type allele frequencies p and q
# (r = 1 - p - q), treating n_AA and n_BB as missing data.

# declare the variables
nA <- 28; nB <- 24; nAB <- 70; nOO <- 41  # observed phenotype counts
L <- c(0.3, 0.2)                          # initial values for p, q
tol <- .Machine$double.eps^0.5            # relative convergence tolerance
n <- 10000                                # max. number of iterations
M <- numeric(n)                           # log-likelihood value at each M-step

p <- L[1]; q <- L[2]  # initial values
for (i in seq_len(n)) {
  # E-step: expected fraction of AA among A-type and of BB among B-type,
  # e.g. a = p^2 / (p^2 + 2pr) = p / (2 - p - 2q) with r = 1 - p - q.
  a <- p / (2 - p - 2 * q)
  b <- q / (2 - 2 * p - q)
  # Expected allele counts given the current (p, q).
  fp <- nA * (1 + a) + nAB
  fq <- nB * (1 + b) + nAB
  fpq <- nA * (1 - a) + nB * (1 - b) + 2 * nOO
  # Record the maximum likelihood values in M-steps.
  M[i] <- fp * log(p) + fpq * log(1 - p - q) + fq * log(q)
  # Keep the current values to measure the size of the update.
  # (The original stored them in `x` and `y`, clobbering unrelated globals.)
  p_old <- p; q_old <- q
  # M-step: update the parameters.
  p <- fp / (fp + fq + fpq)
  q <- fq / (fp + fq + fpq)
  # Stop when both relative changes fall below the tolerance.
  if (abs(p - p_old) / p_old < tol && abs(q - q_old) / q_old < tol) break
}
# Report the converged estimates. (The original assigned p.hat/q.hat inside
# the loop before the break, so the reported values lagged one update behind.)
p.hat <- p; q.hat <- q
EI <- M[1:i]  # the recorded values of the M-steps
data.frame(p.hat, q.hat)
## the estimates of p and q are shown above.
i
## the number of iterations until convergence
-EI
x <- seq_len(i)
plot(x, EI)
## the maximum likelihood values in the M-steps are increasing
Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:
formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt )
For loops
out_formulas <- vector('list', length(formulas)) for(i in seq_along(formulas)) { out_formulas[[i]] <- lm(formulas[[i]], data = mtcars) } out_formulas
lapply()
lapply(formulas, lm, data = mtcars)
Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function?
bootstraps <- lapply(1:10, function(i) { rows <- sample(1:nrow(mtcars), rep = TRUE) mtcars[rows, ] })
for loops
out_bootstrap <- vector('list', length(bootstraps)) for(i in seq_along(bootstraps)) { out_bootstrap[[i]] <- lm(mpg~disp, data = bootstraps[[i]]) } out_bootstrap
lapply()
lapply(bootstraps, lm, formula = mpg~disp)
For each model in the previous two exercises, extract R2 using the function below.
rsq <- function(mod) summary(mod)$r.squared
# R^2 for each of the four models fitted to mtcars.
lapply(out_formulas, rsq)
# R^2 for each model fitted to a bootstrap replicate.
lapply(out_bootstrap, rsq)
The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.
trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE )
Extra challenge: get rid of the anonymous function by using [[ directly.
sapply(trials, function(mod) mod$p.value)
Extra challenge:
sapply(trials, `[[`, 'p.value')
Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?
library(parallel) mcvMap <- function(f, FUN.VALUE , ...) { out <- mcMap(f, ...) vapply(out, identity, FUN.VALUE) }
Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or coding from the mathematical definition (http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test).
| $n_{11}$ | $n_{12}$ | $n_{13}$ | $n_{14}$ |
|------|------|------|------|
| $n_{21}$ | $n_{22}$ | $n_{23}$ | $n_{24}$ |

The test statistic is $$X^2=\sum_{i=1}^{2}\sum_{j=1}^{4}\frac{(n_{ij}-n_{i\cdot}n_{\cdot j}/n)^2}{n_{i\cdot}n_{\cdot j}/n},$$ where $n_{i\cdot}$ and $n_{\cdot j}$ are the row and column totals and $n$ is the grand total.
expected <- function(colsum, rowsum, total) { (colsum / total) * (rowsum / total) * total } #for $n_{i??}n_{??j}/n$ chi_stat <- function(observed, expected) { ((observed - expected) ^ 2) / expected } chisq_test2 <- function(x, y) {#x for the first row, y for the second row total <- sum(x) + sum(y) rowsum_x <- sum(x) rowsum_y <- sum(y) chistat <- 0 for (i in seq_along(x)) { colsum <- x[i] + y[i] expected_x <- expected(colsum, rowsum_x, total) expected_y <- expected(colsum, rowsum_y, total) chistat <- chistat + chi_stat(x[i], expected_x) chistat <- chistat + chi_stat(y[i], expected_y) } chistat }
#examples of the use of the new chisq_test2 print(chisq_test2(seq(3, 8), seq(4, 9))) print(chisq.test(seq(3, 8), seq(4, 9))) ##there is a doubt that the value of chisq from the new chisq_test is much different from the chisq.test function
Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?
table2 <- function(x, y) { x_val <- unique(x) y_val <- unique(y) mat <- matrix(0L, length(x_val), length(y_val)) for (i in seq_along(x)) { mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <- mat[which(x_val == x[[i]]), which(y_val == y[[i]])] + 1L } dimnames <- list(x_val, y_val) names(dimnames) <- as.character(as.list(match.call())[-1]) # R has names for dimnames... :/ tab <- array(mat, dim = dim(mat), dimnames = dimnames) class(tab) <- "table" tab }
#examples of the use of the new table2() a <- c(1, 2, 3) identical(table(a, a), table2(a, a)) b <- c(2, 3, 4) identical(table(a, b), table2(a, b)) c <- c(1, 2, 3, 1, 2, 3) d <- c(2, 3, 4, 2, 3, 4) identical(table(c, d), table2(c, d)) e <- c(1, 2, 2) identical(table(a, e), table2(a, e)) identical(table(b, e), table2(b, e)) identical(table(e, e), table2(e, e)) f <- c(1, 1, 1) identical(table(f, f), table2(f, f)) identical(table(e, f), table2(e, f)) g <- c(1, 4, 9) identical(table(g, g), table2(g, g)) identical(table(g, f), table2(g, f))
apply table2() to chisq_test2
expected <- function(colsum, rowsum, total) { (colsum / total) * (rowsum / total) * total } #for $n_{i??}n_{??j}/n$ chi_stat <- function(observed, expected) { ((observed - expected) ^ 2) / expected } x <- y <- NULL z <- table2(x,y) chisq_test3 <- function(z) {#x for the first row, y for the second row total <- sum(x) + sum(y) rowsum_x <- sum(x) rowsum_y <- sum(y) chistat <- 0 for (i in seq_along(x)) { colsum <- x[i] + y[i] expected_x <- expected(colsum, rowsum_x, total) expected_y <- expected(colsum, rowsum_y, total) chistat <- chistat + chi_stat(x[i], expected_x) chistat <- chistat + chi_stat(y[i], expected_y) } chistat }
compare the time
x <- seq(3, 8) y <- seq(4, 9) system.time(print((chisq_test2(x, y)))) system.time(print(chisq_test3(table2(x, y)))) ##we can see that the new table function dose speed up the new chisq_test function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.