MCestimate=function(n,x){ u=runif(n) #Generate iid Uniform(0,1) random numbers cdf=numeric(length(x)) for(i in 1:length(x)){ g=30*x[i]^3*u^2*(1-u*x[i])^2 cdf[i]=mean(g) #the estimated value of the cdf } phi=pbeta(x,3,3) #the estimated values returned by the pbeta function print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function. } MCestimate(1000,c(1,2,3,4,5,6,7,8,9)/10)
chisq_test=function(x, y) { n=sum(x) + sum(y) ni._x=sum(x) ni._y=sum(y) chisq.statistic=0 for (i in seq_along(x)) { n.j=x[i] + y[i] expected_x=(ni._x*n.j)/n expected_y=(ni._y*n.j)/n chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y } chisq.statistic } chisq_test(c(3,8,4,9),c(6,9,3,5))
Jacknife.estimate=function(x,y){ set.seed(1) n=length(x) theta.hat=cor(x,y) #the correlation estimate print(theta.hat) #compute the jackknife replicates, leave-one-out estimates theta.jack=numeric(n) for(i in 1:n){ theta.jack[i]=cor(x[-i],y[-i]) } bias=(n-1)*(mean(theta.jack)-theta.hat) print(bias) #a jackknife estimate of the bias of the correlation statistic se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2)) print(se) #a jackknife estimate of the standard error of the correlation statistic } Jacknife.estimate(c(0.1,0.5,1,1.2,0.2,0.4,0.8,0.6,0.8,1.3),c(4,2.7,5.4,3.6,6.3,3.9,4.7,3.9,2.8,5.6))
creat a frame
ID=c(1,2,3,4) age=c(25,34,28,52) status=c("poor","improved","excelled","poor") patientdata<-data.frame(ID,age,status) patientdata
Perform calculations on the matrix
m1=matrix(1,nr=2,nc=2) m2=matrix(2,nr=2,nc=2) m=rbind(m1,m2) #merge matrices m n=cbind(m1,m2) n
The 2D graphics of 10 pairs of random values
x=rnorm(10) y=rnorm(10) plot(x,y,xlab="Ten random values",ylab="Ten other values",xlim=c(-2,2),ylim=c(-2,2),main="How to customize a plot with R")
A discrete random variable X has probability mass function | x | 0 | 1 | 2 | 3 | 4 | |------|-----|-----|-----|-----|-----| | p(x) | 0.1 | 0.2 | 0.2 | 0.2 | 0.3 | Use the inverse transform method to generate a random sample of size 1000 from the distribution of X. Construct a relative frequency table and compare the empirical with the theoretical probabilities. Repeat using the R sample function.
Idea: Use the inverse transformation method and sample function to generate random Numbers, and then compare with the theoretical probability.And in this question,I use two methods to solve the problem.
#Solution 1 x=c(0,1,2,3,4) p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities cp=cumsum(p) n=1000 r=numeric(n) r=x[findInterval(runif(n),cp)+1] # the 1000 random numbers generated from the distribution of X ct=as.vector(table(r)) # take the numbers of every value of X in R and form a vector pr=ct/sum(ct) #the empirical probabilities s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function st=as.vector(table(s)) # take the numbers of every value of X in R and form a vector psample=st/sum(st) #the empirical probabilities comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) #the comparison of the empirical,the theoretical and the sample probabilities comparison a=c(ct,st) rnames=c(0,1,2,3,4) cnames=c("ct","st") mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function mymatrix barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function
#Solution 2 p=c(0.1,0.2,0.2,0.2,0.3) #the theoretical probabilities x=rep(1,1000) # replicates the values of 1 for(i in 1:1000){ u=runif(1) if(u<0.1) x[i]=0 else{ if(u<0.3) x[i]=1 else{ if(u<0.5) x[i]=2 else{ if(u<0.7) x[i]=3 else x[i]=4 } } } } # x :the 1000 random numbers generated from the distribution of X ct=as.vector(table(x)) # take the numbers of every value of X in R and form a vector pr=ct/sum(ct) #the empirical probabilities s=sample(c(0,1,2,3,4),size=n,replace=TRUE,prob=c(0.1,0.2,0.2,0.2,0.3)) # the 1000 random numbers generated from the sample function st=as.vector(table(s)) # take the numbers of every value of X in R and form a vector psample=st/sum(st) #the empirical probabilities comparison=data.frame(x=c(0,1,2,3,4),p,pr,psample) #the comparison of the empirical,the theoretical and the sample probabilities comparison a=c(ct,st) rnames=c(0,1,2,3,4) cnames=c("ct","st") mymatrix=matrix(a,nrow=5,ncol=2,byrow=FALSE,dimnames=list(rnames,cnames)) #Generate a matrix whose elements are the frequencies of each random number generated by inverse transformation and sample function mymatrix barplot(mymatrix,xlab="random numbers",ylab="Frequency",col=c("red","yellow","blue","green","pink"),legend=rownames(mymatrix),beside=TRUE) #the barplot of the frequencies of each random number generated by inverse transformation and sample function
Write a function to generate a random sample of size $n$ from the $Beta(a,b)$ distribution by the acceptance-rejection method. Generate a random sample of size 1000 from the $Beta(3,2)$ distribution. Graph the histogram of the sample with the theoretical $Beta(3,2)$ density superimposed.
# beta(3,2) with pdf f(x)=12x^2(1-x),0<x<1 # g(x)=x^2,0<x<1 and c=12 n=1000 k=0 #counter for accepted j=0 #iterations y=numeric(n) while(k<n){ u=runif(1) j=j+1 x=runif(1) #andom variate from g if(x^2*(1-x)>u){ #we accept x k=k+1 y[k]=x #deliver y=x } } j # y:the 1000 random numbers generated from the Beta(a,b) distribution plot(y) #the scatter plot of the 1000 random numbers generated from the Beta(a,b) distribution hist(y,prob=TRUE,main=expression(f(x)==24*x*(1-x)-12*x^2)) #density histogram of sample t=seq(0,1,.01) lines(t,12*t^2*(1-t)) #density curve f(x)
Simulate a continuous Exponential-Gamma mixture. Suppose that the rate parameter $\Lambda$ has $Gamma(r,\beta)$ distribution and $Y$ has $Exp(\Lambda)$ distribution. That is,$(Y |\Lambda = \lambda)$~$f_{Y}(y|\lambda) = \lambdae^{-\lambday}$. Generate 1000 random observations from this mixture with $r = 4$ and $\beta= 2$.
#generate a Poisson-Gamma mixture n=1000 r=4 beta=2 lambda=rgamma(n, r, beta) #lambda is random #now supply the sample of lambda???s as the exponential mean y=rexp(n, lambda) #the 1000 random numbers generated from the mixture distribution plot(y) # the scatter plot of the 1000 random numbers generated from the mixture distribution
Write a function to compute a Monte Carlo estimate of the $Beta(3, 3)$ cdf, and use the function to estimate $F(x)$ for $x = 0.1,0.2,...,0.9$. Compare the estimates with the values returned by the $pbeta$ function in R.
Idea: First,the $Beta(3, 3)$ pdf is $$p(x)=30x^2(1-x)^2,0<x<1$$,so the $Beta(3, 3)$ cdf is $$F(x)=\int_{0}^{x}30t^2(1-t)^2dt$$. Making the substitution$y = t/x$, we have $dt = xdy$ and $\theta=\int_{0}^{1}30x^2y^2(1-xy)^2xdy$. Where the random variable Y has the $Uniform(0,1)$ distribution.Thus, $\theta=E_{Y}[30x^3Y^2(1-xY)^2]$.Generate iid $Uniform(0,1)$ random numbers $u_{1},u_{2},\cdots,u_{m}$ and compute $$\hat{\theta}=\overline{g_{m}(u)}=1/m\sum_{i=1}^{m}30x^3u_{i}^2(1-xu_{i})^2$$. The sample mean$\hat{\theta}$converges to $E[\hat{\theta}]=\theta$ as $m\to\infty$.
#write a function to compute aMonte Carlo estimate of the Beta(3,3) my_MCestimate=function(n,x){ u=runif(n) #Generate iid Uniform(0,1) random numbers cdf=numeric(length(x)) for(i in 1:length(x)){ g=30*x[i]^3*u^2*(1-u*x[i])^2 cdf[i]=mean(g) #the estimated value of the cdf } phi=pbeta(x,3,3) #the estimated values returned by the pbeta function print(round(rbind(x,cdf,phi),3)) #Compare the estimates with the value F(x) computed (numerically) by the pbeta function. } x=c(1,2,3,4,5,6,7,8,9)/10 n=1000 my_MCestimate(n,x) #the estimated values returned by the my_MCestimate function
From the results, the estimated value by monte carlo method is close to the estimated value from the pbeta function.This shows that we can use monte carlo method to estimate the distribution function directly.
The Rayleigh density [156, (18.76)] is$$f(x)=\frac{x}{\sigma^2}e^\frac{-x^2}{2\sigma^2},x\ge0,\sigma>0$$ Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution,using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$?
Idea: $f(x)=\frac{x}{\sigma^2}e^\frac{-x^2}{2\sigma^2},x\ge0,\sigma>0$,so the cdf is $$F(x)=\int_{0}^{x}\frac{t}{\sigma^2}e^-\frac{t^2}{2\sigma^2},t\geq0,\sigma>0$$.Making the substitution $u = t/x$, we have $dt = xdu$ and $$\theta=\int_{0}^{1}\frac{ux^2}{\sigma^2}e^-\frac{(xu)^2}{2\sigma^2}du$$. Where the random variable U has the $Uniform(0,1)$ distribution.Thus, $$\theta=E_{U}[\frac{Ux^2}{\sigma^2}e^-\frac{(xU)^2}{2\sigma^2}]$$. Generate random numbers $u_{1},\dots,u_{m/2}\sim uniform(0,1)$ and compute half of the replicates using $$Y_{j}=g^{(j)}(u)=\frac{u_{j}x^2}{\sigma^2}e^-\frac{(xu_{j})^2}{2\sigma^2},j=1,2,\dots,m/2$$; $$Y'{j}=\frac{(1-u{j})x^2}{\sigma^2}e^-\frac{(x(1-u_{j})^2}{2\sigma^2},j=1,2,\dots,m/2$$. Thus,$$\hat{\theta}=\overline{g_{m}(u)}=\frac{1}{m}\sum_{j=1}^{m/2}(\frac{u_{j}x^2}{\sigma^2}e^-\frac{(xu_{j})^2}{2\sigma^2}+\frac{(1-u_{j})x^2}{\sigma^2}e^-\frac{(x(1-u_{j})^2}{2\sigma^2})$$.The sample mean$\hat{\theta}$converges to $E[\hat{\theta}]=\theta$ as $m\to\infty$.
#Implement a function to generate samples from a $Rayleigh(\sigma)$ distribution,using antithetic variables. MC.av=function(x, m= 10000, antithetic = TRUE) { u=runif(m/2) #Generate random numbers u(1),u(2),...,u(m/2)~Unifrom(0,1) if (!antithetic) v=runif(m/2) else v=1-u u=c(u, v) sigma=10 cdf=numeric(length(x)) for (i in 1:length(x)) { g=u*x[i]^2/(sigma^2)* exp(-(u * x[i])^2/2*(sigma)^2) cdf[i]=mean(g) #the estimated value of the cdf } cdf } x=seq(1.0,5.0,length=10) set.seed(123) MC1=MC.av(x,anti=FALSE) ##generate samples from the distribution without using antithetic variables set.seed(123) MC2=MC.av(x) #generate samples from the distribution by using antithetic variables print(round(rbind(x,MC1,MC2),10)) c(sd(MC1),sd(MC2)) print((var(MC1)-var(MC2))/var(MC1)) #the percent reduction in variance of $\frac{X+X'}{2}$ compared with$\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$
From the results,we can conclude that the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_{1}+X_{2}}{2}$ for independent $X_{1},X_{2}$ is 95.9%.
Find two importance functions $f_{1}$ and $f_{2}$ that are supported on $(1,\infty)$ and are 'close' to $$g(x)=\frac{x^2}{\sqrt{2\pi}}e^-x^2/2,x>1$$. Which of your two importance functions should produce the smaller variance in estimating $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^\frac{-x^2}{2}dx$$ by importance sampling? Explain.
Idea: $g(x)=\frac{x^2}{\sqrt{2\pi}}e^-x^2/2,x>1$,two possible choices of important function to estimate $\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^-x^2/2dx$ by sampling method are compared.The two important functions are $$f1=\frac{1}{\sqrt{2\pi}}e^-\frac{x^2}{2}$$ and $$f2=\frac{1}{2}x^2e^-x$$.
m=10000 theta.hat=se=numeric(2) g=function(x) { ((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1) } x=rnorm(m) #using f1 fg1=g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2)) theta.hat[1]=mean(fg1) #The first estimate se[1]=sd(fg1) #The variance of the first estimate x=rgamma(m,3,1) #using f2 x=x[x>1] #Limit the range of x,let x>1 fg2=g(x)/((1/2)*(x^2)*exp(-x)) theta.hat[2]=mean(fg2) #The second estimate se[2]=sd(fg2) #The variance of the second estimate rbind(theta.hat, se) #Draw the curves of g, f1 and f2, then compare them curve(g,1,5,ylab="",main="The comparsion of the curves of g,f1 and f2") f1=function(x){ (1/sqrt(2*pi))*exp(-(x^2)/2) } curve(f1,1,5,add=TRUE,col="red") f2=function(x){ (1/2)*(x^2)*exp(-x) } curve(f2,1,5,add=TRUE,col="green") #Draw the curves of fg1 and fg2, and compare them fg1=function(x){ g(x)/((1/sqrt(2*pi))*exp(-(x^2)/2)) } curve(fg1,1,5,ylab="",main="The comparsion of the curves of fg1 and fg2") fg2=function(x){ g(x)/((1/2)*(x^2)*exp(-x)) } curve(fg2,1,5,add=TRUE,col="red")
From the results,we can conclude that the variance of $f1=\frac{1}{\sqrt{2\pi}}e^-\frac{x^2}{2}$ produced is 1.1691 and the variance of $f2=\frac{1}{2}x^2e^-x$ produced is 0.4635,so the $f2=\frac{1}{2}x^2e^-x$ produce the smaller variance in esmating the integration.From the graphs,we can conclude that the function that corresponds to the most nearly constant ratio $g(x)/f(x)$ appears to be f2.
Obtain a Monte Carlo estimate of $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^\frac{-x^2}{2}dx$$ by importance sampling.
Idea: From the question 3(exercise 5.13),we know that the $f2=\frac{1}{2}x^2e^-x$ is the more appropriate important function.So,in this question,we can use f2 to estimate tht integration.
m=10000 theta.hat=se=numeric(1) g=function(x) { ((x^2)*exp(-(x^2)/2))/sqrt(2*pi) * (x>1) } x=rgamma(m,3,1) #using f x=x[x>1] #Limit the range of x,let x>1 fg=g(x)/((1/2)*(x^2)*exp(-x)) theta.hat[1]=mean(fg) #The second estimate se[1]=sd(fg) #The variance of the second estimate rbind(theta.hat, se)
From the results,we can conclude that the estimate is 0.4313,and I have computed the real value is 0.4006.This result shows that the estimated effect of f2 is good.
Let $X$ be a non-negative random variable with $\mu=E[X]<\infty$. For a random sample $x_{1},\dots,x_{n}$ from the distribution of $X$, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum_{j=1}^{n}\sum_{i=1}^{n}|x_{i}-x_{j}|.$$
The Gini ratio is applied in economics to measure inequality in income dis- tribution (see e.g. [163]). Note that $G$ can be written in terms of the order statistics $x_{(i)}$ as $$\hat{G}=\frac{1}{n^2\mu}\sum_{i=1}^{n}(2i-n-1)x_{(i)}.$$
If the mean is unknown, let $\hat{G}$ be the statistic G with $\mu$ replaced by $\overline{x}$. Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal. Repeat the procedure for the uniform distribution and $Bernoulli(0.1)$. Also construct density histograms of the replicates in each case.
Idea:
Generate the replicates $\hat{G}^{(i)},i=1,\dots,n$ when $x$ is standard lognormal,the uniform distribution or $Bernoulli(0.1)$ by repeating:
(1)Generate $x^{(i)}{1},\dots,x^{(i)}{n}$ iid from the distribution of x.
(2)sort $x^{(i)}{1},\dots,x^{(i)}{n}$ in increaing order,to obtain $x^{(i)}{(1)}\leq\dots\leq x^{(i)}{(n)}$.
(3)compute $\hat{G}^{(i)}=\frac{1}{n^2\mu}\sum_{j=1}^{n}(2j-n-1)x^{(i)}_{(j)}$.
Estimate by simulation the mean, median and deciles of $\hat{G}$ if $X$ is standard lognormal,the uniform distribution and $Bernoulli(0.1)$. And construct density histograms of the replicates in each case.
#the number of test n=1000 #estimate the G n=1000 G.hat=matrix(0,nrow=3,ncol=n) mean.G.hat=numeric(3) median.G.hat=numeric(3) deciles.G.hat=numeric(3) x=matrix(0,nrow=3,ncol=n) mu.hat=matrix(0,nrow=3,ncol=n) t=numeric(n) for(k in 1:3){ for(i in 1:n){ # i reprent the i-th test x[1,1:n]=sort(rlnorm(n)) #generated the random number of x1,...,xn from the standard lognormal distribution,and sort them x[2,1:n]=sort(runif(n)) #generated the random number of x1,...,xn from the uniform distribution,and sort them x[3,1:n]=sort(rbinom(n,1,0.1)) #generated the random number of x1,...,xn from the Bernoulli(0.1),and sort them mu.hat[k,i]=sum(x[k,1:n])/n #mu replaced by the mean of x for(j in 1:n){ t[j]=(2*j-n-1)*x[k,j] } G.hat[k,i]=sum(t[1:n])/(mu.hat[k,i]*(n^2)) #the estimate of G from the k-th distribution } } #Estimate the mean,median and deciles of G.hat if X is standard lognorm,uniform distribution and benoulli(0.1) mean.G.hat=apply(G.hat,1,mean) #the mean of G.hat median.G.hat=apply(G.hat,1,median) #the median of G.hat deciles.G.hat[1]=quantile(G.hat[1,1:n],0.1) #the deciles of G.hat deciles.G.hat[2]=quantile(G.hat[2,1:n],0.1) deciles.G.hat[3]=quantile(G.hat[3,1:n],0.1) round(rbind(mean.G.hat,median.G.hat,deciles.G.hat),3) #output the mean,median and deciles of G.hat when x is standard lognormal,the uniform distribution and Bernoulli(0.1) #construct the density histograms of the replicates hist(G.hat[1,1:n],main="density histogram of G.hat when x is standard lognormal") hist(G.hat[2,1:n],main="density histogram of G.hat when x is uniform distribution") hist(G.hat[3,1:n],main="density histogram of G.hat when x is Bernoulli(0.1)")
From the results,we can know that the mean,median and deciles of G.hat is approximately equal under the same distribution of x.But when x is different distribution,the value is different too.
Construct an approximate 95% confidence interval for the Gini ratio $\gamma=E(G)$ if $X$ is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.
Idea:
(1)Firstly,assume the $\gamma \sim norm(E(\gamma),var(\gamma))$,use the method of Exercise 6.9 to estimate $E(\hat{G})$ and $Var(\hat{G})$,and let $E(\hat{G})$ and $Var(\hat{G})$ replace $E(\gamma),var(\gamma)$.
(2)construct a function to return the 95% confidence interval of gamma
(3)Then,repeat the estimation of confidence interval procedure 100 times,compute the times $m$ of the true $\gamma$ is in the confidence interval
(4)Lastly,the coverage rate $CP=m/100$
#construct a function to return the 95% confidence interval of gamma CI=function(n,a,b,alpha=0.05){ t=numeric(n) g=numeric(n) for(i in 1:n){ x=sort(rlnorm(n,a,b)) mu.hat=mean(x) for(j in 1:n){ t[j]=(2*j-n-1)*x[j] } g[i]=mean(t)/(n*mu.hat) } gamma.hat=mean(g) sd.hat=sqrt(sum((g-mean(g))^2))/n #estimate the standard deviation of gamma.hat c=qnorm(1-alpha/2,0,1)*sd.hat CI1=gamma.hat-c #the confidence lower limit of gamma CI2=gamma.hat+c #the confidence upper limit of gamma c(CI1,CI2) } CI(1000,1,1,0.05) #construct an approximate 95% confidence interval of gamma when the parameters are given #assess the coverage rate d=matrix(0,nrow=100,ncol=2) for(k in 1:100){ d[k,1:2]=CI(1000,1,1,0.05) } CI1=d[1:100,1] CI2=d[1:100,2] #simulate the true value of gamma gamma=2*pnorm(1/sqrt(2),0,1)-1 #I get this value in an established paper gamma m=0 for(i in 1:100){ if(gamma<CI1[i]) m=m else{ if(gamma>CI2[i]) m=m else{ m=m+1 } } } m CP=m/100 CP #the coverage rate
n=1000 mu=0 sigma=0.9 repsig=400 repmean=100 system.time({ G=function(x,mu=mean(x),n=length(x))sort(x)%*%(1:n*2-n-1)/n/n/mu mG=function(sig){mu=mean(rlnorm(repmean*n,0,sig));mean(sapply(1:repmean,function(...){G(rlnorm(n,0,sig),mu)}))} X=rlnorm(n,mu,sigma) print((CI_sig=sqrt(var(log(X))/qchisq(c(0.9875,0.0125),df=n-1)*(n-1)))) rsig=sqrt(var(log(X))/qchisq(runif(repsig,0.0125,0.9875),df=n-1)*(n-1)) Gs=sapply(rsig,mG) print(quantile(Gs,c(0.0125,0.9875))) })
From the results,we can see that the coverage rate CP is 0.43 when conduct 100 times experiment.The CP is visibly smaller than the nominal level,then it is said to be liberal.
Tests for association based on Pearson product moment correlation $\rho$, Spearman???s rank correlation coefficient $\rho_{s}$, or Kendall???s coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_{s}$ or $\tau$ are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X,Y ) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.
library(MASS) Sigma=matrix(c(1,2,1,9),2,2) A=mvrnorm(n=1000, rep(0, 2), Sigma) #the sampled distribution is bivariate normal plot(A[1:1000,1],A[1:1000,2]) cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="pearson",conf.level=0.95) cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="kendall",conf.level=0.95) cor.test(A[1:1000,1],A[1:1000,2],alternative="two.sided",method="spearman",conf.level=0.95) #compute the power power=matrix(0,nrow=20,ncol=3) for(i in 1:20) { pvalues <- replicate(100, expr = { Sigma=matrix(c(1,2,1,9),2,2) A=mvrnorm(n=100, rep(0, 2), Sigma) ctest=cor.test(A[1:100,1],A[1:100,2],alternative="two.sided",method=c("pearson","kendall","spearman"),conf.level=0.95) ctest$p.value } ) power[i,1:3]=mean(pvalues<=0.05) }
Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.
Idea:
Compute a jackknife estimate of the bias and the standard error of the corre- lation statistic in Example 7.2 as follows:
(1)Let $x=(x_{1},\dots,x_{n})$ be an observed random sample, and idefine the $i$th jackknife sample $x_{(i)}$ to be the subset of $x$ that leaves out the $i^{th}$ observation $x_{i}$. That is,$$x_{(i)}=(x_{1},\dots,x_{i},x_{i+1},\dots,x_{n})$$.
(2)$\hat{\theta}=cor(LAST,GPA)$,define the $i^{th}$ jackknife replicate $\hat{\theta}=cor(LAST[-i],GPA[-i])$.
(3)The jackknife estimate of bias is $$\hat{bias}{jack}=(n-1)(\overline{\hat{\theta{(.)}}}-\hat{\theta})$$. where $\hat{\theta_{(.)}}=\frac{1}{n}\sum_{i=1}^{n}\hat{\theta}_{i}$.
(4)A jackknife estimate of standard error is $$\hat{se}{jack}=\sqrt{\frac{n-1}{n}\sum{i=1}^{n}(\hat{\theta}{(i)}-\overline{\hat{\theta{(.)}}})^2}$$.
set.seed(1) library(bootstrap) #for the law data LSAT=law$LSAT GPA=law$GPA n=nrow(law) system.time({ theta.hat=cor(LSAT,GPA) #the correlation estimate print(theta.hat) #compute the jackknife replicates, leave-one-out estimates theta.jack=numeric(n) for(i in 1:n){ theta.jack[i]=cor(LSAT[-i],GPA[-i]) } bias=(n-1)*(mean(theta.jack)-theta.hat) print(bias) #a jackknife estimate of the bias of the correlation statistic se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2)) print(se) #a jackknife estimate of the standard error of the correlation statistic })
The jackknife estimate of the bias and the standard error of the correlation statistic are -0.006473623 and 0.1425186.
Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures $1/\lambda$ by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.
set.seed(1) library(boot) #for boot and boot.ci n=100 #the times of bootstrap replicates system.time({ x=c(3,5,7,18,43,85,91,98,100,130,230,487) boot.mean=function(x,i) mean(x[i]) mean.hat=mean(x) boot.obj=boot(x,statistic = boot.mean,R=n) print(boot.ci(boot.obj,type=c("norm","basic","perc","bca"))) })
From the results,we can see that the 95% bootstrap confidence intervals for the mean time between failures by the standard normal, basic,percentile, and BCa methods respectively are $( 30.6, 187.0 ), ( 4.4, 172.2 ), ( 43.9, 211.7 ), and ( 53.9, 226.3 ).$
Because the BCa confidence intervals are transformation respecting and BCa intervals have second order accuracy, the bootstrap t confidence interval is second order accurate but not transformation respecting, the bootstrap percentile interval is transformation respecting but only first order accurate, the standard normal confidence interval is neither transformation respecting nor second order accurate, so the intervals may differ.Hence, the larger the order of accuracy, the faster the error is reduced.So the Bca method is more accurate.
Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat{\theta}$.
Idea:
(1)the MLE of covariance is $$\hat{\Sigma}=\frac{1}{n-1}(X-\mu)(X-\mu)^T,$$ where the $X=scor^T=(x_{1},\dots,x_{n})$ is the sample matrix, ,$x_{i}=(x_{i1},\dots,x_{i5})^T$,$\mu=(\mu_{1},\dots,\mu_{n}),n=1,\dots,88$, where $\mu_{i}$ is the rowmeans of $scor^T$.
(2)obtain the jackknife estimates,first and idefine the $i$th jackknife sample $x_{(i)}$ to be the subset of $x$ that leaves out the $i^{th}$ observation $x_{i}$. That is,$$Y_{i}=(x_{1},\dots,x_{-1},x_{i+1},\dots,x_{n})$$.
(3)$\hat{\Sigma}=\frac{1}{n-1}(X-\mu)(X-\mu)^T$,define the $i^{th}$ jackknife replicate $\hat{\Sigma}=\frac{1}{n-1}(Y-\mu)(Y-\mu)^T$.
(4)Then,compute the eigenvalues of Sigma.hat under $\hat{\Sigma}$,$\hat{\lambda}{1}>\dots>\hat{\lambda}{5}$,then $\hat{\theta}=\frac{\hat{\lambda}{1}}{\sum{j=1}^{j=5}\hat{\lambda}_{j}}$.
(5)Then,compute the eigenvalues of Sigma.jack under $\hat{\Sigma}{jack}$,then compute the $\hat{\theta}{jack}$.
(6)The jackknife estimate of bias is $$\hat{bias}{jack}=(n-1)(\overline{\hat{\theta{(.)}}}-\hat{\theta})$$. where $\hat{\theta_{(.)}}=\frac{1}{n}\sum_{i=1}^{n}\hat{\theta}_{jack(i)}$.
(7)A jackknife estimate of standard error is $$\hat{se}{jack}=\sqrt{\frac{n-1}{n}\sum{i=1}^{n}(\hat{\theta}{(i)}-\overline{\hat{\theta{(.)}}})^2}$$.
set.seed(1) data(scor, package = "bootstrap") #for the score data library(MASS) scor.m= matrix(as.numeric(t(scor)), 5, 88) n=nrow(scor) #sample size mu=rowMeans(scor.m) mu0=matrix(mu,nrow=5,ncol=n) system.time({ sigma.hat=((scor.m-mu0)%*%t(scor.m-mu0))/(n-1) # the MLE of covariance matrix ev.hat=sort(eigen(sigma.hat)$values,decreasing=T) theta.hat=ev.hat[1]/sum(ev.hat) theta.hat #the estimate of theta #obtain the jackknife estimates of bias and standard error of theta.hat theta.jack=numeric(n) for(i in 1:n){ scor.m.jack= scor.m[,-i] #compute the jackknife replicates, leave-one-out estimates mu1=matrix(mu,nrow=5,ncol=n-1) sigma.jack=((scor.m.jack-mu1)%*%t(scor.m.jack-mu1))/(n-1) ev.jack=sort(eigen(sigma.jack)$values,decreasing=T) theta.jack[i]=ev.jack[1]/sum(ev.jack) } theta.jack bias=(n-1)*(mean(theta.jack)-theta.hat) print(bias) #a jackknife estimate of the bias se=sqrt((n-1) *mean((theta.jack - mean(theta.jack))^2)) print(se) #a jackknife estimate of the standard error })
The jackknife estimates of bias and standard error of $\hat{\theta}$ are 0.001044413 and 0.04897464.
In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.
Idea:
(1)Leave-two-out cross-validation
Leave-two-out cross-validation involves using 2 observations as the validation set and the remaining $n-2$ observations as the training set. This is repeated on all ways to cut the original sample on a validation set of p observations and a training set. LtO cross-validation requires training and validating the model $m=c^{n}{2}$ times, where n is the number of observations in the original sample, and where $m=c^{n}{2}$ is the binomial coefficient.
(2)Let $y=magnetic,x=chemical$, For $k = 1,\dots,n$, let observation $(x_{i},y_{i}),(x_{j},y_{j})$ be the test point and use the remaining observations to fit the model.
(a) Fit the model(s) using only the $n ??? 2$ observations in the training set
(b) Compute the predicted response $$\hat{y}{i}=\hat{\beta}{0}+\hat{\beta}{1}x_{i}$$, $$\hat{y}{j}=\hat{\beta}{0}+\hat{\beta}_{1}x{j}$$ for the test point.
(c) Compute the squard prediction error $spe[k]=\frac{(y_{i}-\hat{y}{i})^2+(y{j}-\hat{y}_{j})^2}{2}$ .
(3)Estimate the mean of the squared prediction errors $\hat{\sigma}^2=\overline{spe}$
set.seed(1) library(DAAG) attach(ironslag) n=length(magnetic) #in DAAG ironslag m=n*(n-1)/2 # leave-two-out cross validation require training and validating the model m times spe1=spe2=spe3=spe4=numeric(m) # for leave-two-out cross validation # fit models on leave-two-out samples k=0 for(i in 1:(n-1)){ for(j in (i+1):n){ k=k+1 y=magnetic[-c(i,j)] #leave-two-out x=chemical[-c(i,j)] J1=lm(y ~ x) yhat11=J1$coef[1] + J1$coef[2] * chemical[i] #the estimate value of the i-th value yhat12=J1$coef[1] + J1$coef[2] * chemical[j] #the estimate value of the j-th value spe1[k]=((magnetic[i]-yhat11)^2+(magnetic[j]-yhat12)^2)/2 #the squared prediction error of the k-th J2=lm(y ~ x + I(x^2)) yhat21=J2$coef[1] + J2$coef[2] * chemical[i] +J2$coef[3] * chemical[i]^2 yhat22=J2$coef[1] + J2$coef[2] * chemical[j] +J2$coef[3] * chemical[j]^2 spe2[k]=((magnetic[i]-yhat21)^2+(magnetic[j]-yhat22)^2)/2 J3=lm(log(y) ~ x) logyhat31 =J3$coef[1] + J3$coef[2] * chemical[i] logyhat32 =J3$coef[1] + J3$coef[2] * chemical[j] yhat31=exp(logyhat31) yhat32=exp(logyhat32) spe3[k]=((magnetic[i]-yhat31)^2+(magnetic[j]-yhat32)^2)/2 J4=lm(log(y) ~ log(x)) logyhat41 =J4$coef[1] + J4$coef[2] * log(chemical[i]) logyhat42 =J4$coef[1] + J4$coef[2] * log(chemical[j]) yhat41=exp(logyhat41) yhat42=exp(logyhat42) spe4[k]=((magnetic[i]-yhat41)^2+(magnetic[j]-yhat42)^2)/2 } } c(mean(spe1),mean(spe2),mean(spe3),mean(spe4)) #According to the prediction error criterion, Model 2, the quadratic model, would be the best fit for the data. J2 par(mfrow = c(1, 2)) #layout for graphs plot(J2$fit, J2$res) #residuals vs fitted values abline(0, 0) #reference line qqnorm(J2$res) #normal probability plot qqline(J2$res) #reference line par(mfrow = c(1, 1)) #restore display
According to the prediction error criterion, Model 2, the quadratic model,would be the best fit for the data.Its results are consistent with those of the leave-one out method.From the QQ plot, we can see the Model 2 is similar to the sample distribution,
Implement the two-sample Cram´ er-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2.
set.seed(1) library(nortest) chickwts x=sort(as.vector(chickwts$weight[chickwts$feed == "soybean"])) y=sort(as.vector(chickwts$weight[chickwts$feed == "linseed"])) n=length(x) m=length(y) R=999 #number of replicates z=c(x, y) #pooled sample K=1:26 D=numeric(R) #storage for replicates fnx=ecdf(x) #the ecdf of x fny=ecdf(y) #the ecdf of y options(warn = -1) D0=(sum((fnx(x)-fny(x))^2)+sum((fnx(y)-fny(y))^2))/((m*n)/((m+n)^2)) #statistic for (i in 1:R){ k=sample(K,size=n,replace=FALSE) #permutation x1=z[k] y1=z[-k] D[i]=(sum((fnx(x1)-fny(x1))^2)+sum((fnx(y1)-fny(y1))^2))/((m*n)/((m+n)^2)) #statistic } options(warn = 0) p=mean(D>D0) p
From the results,p=0.2712 is significent,so we should reject the $H_{0}$.
Design experiments for evaluating the performance of the NN,energy, and ball methods in various situations.
(1)Unequal variances and equal expectations
(2)Unequal variances and unequal expectations
(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)
(4)Unbalanced samples (say, 1 case versus 10 controls)
Note: The parameters should be choosen such that the powers are distinguishable (say, range from 0.3 to 0.9).
set.seed(1) library(RANN) library(energy) library(Ball) library(boot) #(1) Unequal variances and equal expectations m=40 #number of replicates R=999 k=3 Tn= function(z, ix, sizes,k) { n1=sizes[1] n2 = sizes[2] n =n1 + n2 if(is.vector(z)) z=data.frame(z,0); z =z[ix, ]; NN =nn2(data=z, k=k+1) block1= NN$nn.idx[1:n1,-1] block2=NN$nn.idx[(n1+1):n,-1] i1 =sum(block1 < n1 + .5); i2=sum(block2 > n1+.5) (i1 + i2) / (k * n) } eqdist.nn=function(z,sizes,k){ boot.obj=boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k) ts=c(boot.obj$t0,boot.obj$t) p.value=mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p1.values=matrix(NA,m,3) for(i in 1:m){ x=rnorm(10,4,4) #generate a sample x~N(4,4) y=rnorm(10,4,1) #generate a sample y~n(4,1) z=c(x,y) n1=length(x) n2=length(y) N=c(n1,n2) p1.values[i,1]=eqdist.nn(z,N,k)$p.value p1.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value p1.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value } eqdist.nn(z,N,k)$p.value alpha=0.1 pow1=colMeans(p1.values<alpha) pow1 names(pow1)=c("NN", "energy", "ball") barplot(pow1) #(2) Unequal variances and unequal expectations m=40 #number of replicates R=999 k=3 z=c(x,y) # pooled sample eqdist.nn=function(z,sizes,k){ boot.obj = boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k) ts=c(boot.obj$t0,boot.obj$t) p.value=mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p2.values=matrix(NA,m,3) for(i in 1:m){ x=rnorm(10,4,4) #generate a sample x~N(4,4) y=rnorm(10,2,1) #generate a sample y~N(2,1) z=c(x,y) n1=length(x) n2=length(y) N=c(n1,n2) p2.values[i,1]=eqdist.nn(z,N,k)$p.value p2.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value p2.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value } alpha=0.1 pow2=colMeans(p2.values<alpha) pow2 names(pow2)=c("NN", "energy", "ball") barplot(pow2) #(3)Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions) m=40 #number of replicates R=999 k=3 z=c(x,y) #poled sample eqdist.nn=function(z,sizes,k){ boot.obj =boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k) ts=c(boot.obj$t0,boot.obj$t) p.value=mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p3.values=matrix(NA,m,3) for(i in 1:m){ x=rt(10,1) #generate a sample x~t(1) y=c(rnorm(5,2,1),rnorm(5,4,4)) #generate a bimodel sample y which is a mixture of N(2,1) and N(4,4) z=c(x,y) n1=length(x) n2=length(y) N=c(n1,n2) p3.values[i,1]=eqdist.nn(z,N,k)$p.value p3.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value p3.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value } alpha=0.1 pow3=colMeans(p3.values<alpha) pow3 names(pow3)=c("NN", "energy", "ball") barplot(pow3) #(4)Unbalanced samples (say, 1 case versus 10 controls) m=40 #number of replicates R=999 k=3 z=c(x,y) #pooled sample eqdist.nn =function(z,sizes,k){ boot.obj= boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k) ts=c(boot.obj$t0,boot.obj$t) p.value=mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p4.values=matrix(NA,m,3) for(i in 1:m){ x=rexp(10,4) #generate a sample x~exp(4) y=rf(15,1,2) #generate a sample y~F(1,2),and the numbers of them are different z=c(x,y) n1=length(x) n2=length(y) N=c(n1,n2) p4.values[i,1]=eqdist.nn(z,N,k)$p.value p4.values[i,2]=eqdist.etest(z,sizes=N,R=R)$p.value p4.values[i,3]=bd.test(x=x,y=y,R=999,seed=i*12345)$p.value } alpha=0.1 pow4=colMeans(p4.values<alpha) pow4 names(pow4)=c("NN", "energy", "ball") barplot(pow4)
For exercise 9.6, use the Gelman-Rubin method to monitor convergence of the chain, and run the chain until the chain has converged approximately to the target distribution according to $\hat{R}<1.2$.
set.seed(1) Gelman.Rubin=function(psi) { # psi[i,j] is the statistic psi(X[i,1:j]) # for chain in i-th row of X psi =as.matrix(psi) n=ncol(psi) k=nrow(psi) psi.means=rowMeans(psi) #row means B=n * var(psi.means) #between variance est. psi.w=apply(psi, 1, "var") #within variances W=mean(psi.w) #within est. v.hat=W*(n-1)/n + (B/n) #upper variance est. r.hat=v.hat / W #G-R statistic return(r.hat) } f=function(theta,x){ f1=(0.5+theta/4)^x[1] f2=((1-theta)/4)^x[2] f3=((1-theta)/4)^x[3] f4=(theta/4)^x[4] if(theta>=0 && theta<1) return(f1*f2*f3*f4) else{ return(0) } } unif.chain=function(m,p1){ #generates a Metropolis chain for unif(0,1) #with unif(-1,1) proposal distribution #and starting value X1 p=rep(0, m) p[1]=p1 u=runif(m) y=runif(m,-1,1) #the proposal distribution for (i in 2:m) { theta=p[i-1]+y[i] num= f(theta,x) den=f(p[i-1],x) if(u[i]<=num/den) p[i]=theta else{ p[i]=p[i-1] } } return(p) } m=5000 #the number of replicates b=4700 #burn-in length t=4 #number of chains to generate x=c(125,18,20,34) p0=c(0.2,0.4,0.6,0.8) #choose overdispersed initial values #generate the chains P=matrix(0,nrow=t,ncol=m) for(i in 1:t) P[i,]=unif.chain(m,p0[i]) #compute diagnostic statistics psi=t(apply(P, 1, cumsum)) for (i in 1:nrow(psi)) psi[i,]=psi[i,] / (1:ncol(psi)) print(Gelman.Rubin(psi))
From the results, we can see that the $\hat{r}$ is $1.0595<1.2$. It shows that the chains have approximately converged to the target distribution.
Find the intersection points $A(k)$ in $(0,\sqrt{k})$ of the curves $$S_{k-1}(a)=P\left(t(k-1)>\sqrt{\frac{a^{2}(k-1)}{k-a^{2}}}\right)$$ and $$S_{k}(a)=P\left(t(k)>\sqrt{\frac{a^{2}k}{k+1-a^{2}}}\right)$$,for $k=4:25,100,500,1000$, where $t(k)$ is a Student $t$ random variable with $k$ degrees of freedom. (These intersection points determine the critical values for a t-test for scale-mixture errors proposed by Sz´ekely [260].)
set.seed(1) library(rootSolve) k=c(4:25,100,500,1000) #the degrees n=length(k) solution=numeric(n) s=numeric(n) for(i in 1:n){ s1=function(a){ q=sqrt((a^2)*(k[i]-1)/(k[i]-a^2)) pt(q,df=k[i]-1,lower.tail = FALSE,log.p = FALSE) } s2=function(a){ q=sqrt((a^2)*k[i]/(k[i]+1-a^2)) pt(q,df=k[i],lower.tail = FALSE,log.p = FALSE) } f=function(a){ s1(a)-s2(a) } solution[i]=uniroot(f,c(0.0001,sqrt(k[i])-0.0001))$root #the abscissa of the intersection points s[i]=s1(solution[i]) #the ordinate of the intersection points } print(round(rbind(solution,s),5))
From the results ,we can see the abscissa of the intersection points is in 1~2 when $k<500$.When $k=500,1000$,there are infinite intersections.
Write a function to compute the cdf of the Cauchy distribution, which has
density $$\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)},-\infty
set.seed(1) f=function(x,theta,eta){ 1/(theta*pi*(1+((x-eta)/theta)^2)) } res=integrate(f,lower=-Inf,upper=6,rel.tol=.Machine$double.eps^0.25,theta=1,eta=1) print(round(rbind(res$value,pcauchy(6,lower.tail = TRUE)),5))
From the result,we can see the value from this method is 0.93717,the value from the R function pcauchy is 0.94743, they are very colsed.
A-B-O blood type problem
Let the three alleles be A, B and O.
| Genotype | AA | BB | OO | AO | BO | AB | AA | | -------- | -----: | :----: | -----: | :----: | -----: | :----: | -----: | | Frenquency | $p^{2}$ | $q^{2}$ | $r^{2}$ | 2pr | 2qr | 2pq | 1 | | Count | nAA | nBB | nOO | nAO | nBO | nAB | nAA |
Observed data: $$n_{A.}=n_{AA}+n_{AO}=28(A-type)$$, $$n_{B.}=n_{BB}+n_{BO}=24(B-type)$$, $$n_{OO}=41(O-type)$$, $$n_{AB}=70(AB-type)$$.
Use EM algorithm to solve MLE of $p$ and $q$ (consider missing data $n_{AA}$ and $n_{BB}$).
Record the maximum likelihood values in M-steps,are they increasing?
ABOexpect=function(nA,nB,p,q,r) { nAA=nA * p / (p + 2*r) nAO=nA - nAA nBB=nB * q / (q + 2*r) nBO=nB - nBB N=list(AA=nAA,AO=nAO,BB=nBB,BO=nBO) } ABOmaximize=function(nAA,nAO,nBB,nBO,nAB,nO) { p=(2 * nAA + nAO + nAB) / (2 * (nA + nB + nO + nAB)) q=(2 * nBB + nBO + nAB) / (2 * (nA + nB + nO + nAB)) r= 1.0 - p - q L=list(p=p,q=q,r=r) } #initial guess p=0.3 q=0.2 r=0.5 ## Observed data (counts of people with the four blood groups) nA =28 nB=24 nAB=41 nO=40 ## Set up iteration iter=1 diff=1 tol=0.0001 ## tolerance while (diff > tol) { E=ABOexpect(nA,nB,p,q,r) M=ABOmaximize(E$AA,E$AO,E$BB,E$BO,nAB,nO) diff=abs(M$p - p) + abs(M$q - q) + abs(M$r -r) p=M$p q=M$q r=M$r cat(sprintf("iter:%d, diff:%.2f\n",iter,diff)) iter=iter + 1 } cat(sprintf("p=%.4f,q=%.4f, r=%.4f\n",p,q,r))
From the result, we can see the MLE of the $p,q$ are respectivly 0.2847, 0.2649.
Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:
formulas=list(
mpg~disp,
mpg~I(1/disp),
mpg~disp+wt,
mpg~I(1/disp)+wt
)
set.seed(1) library(base) formulas=list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt ) #for loops out_formulas=vector('list', length(formulas)) for(i in seq_along(formulas)) { out_formulas[[i]]=lm(formulas[[i]], data = mtcars) } out_formulas #lapply lapply(formulas, lm, mtcars)
Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply() . Can you do it without an anonymous function?
bootstraps=lapply(1:10, function(i) {
rows=sample(1:nrow(mtcars), rep = TRUE)
mtcars[rows, ]
})
set.seed(1) bootstraps=lapply(1:10, function(i) { rows=sample(1:nrow(mtcars), rep = TRUE) mtcars[rows, ] } ) #for loops out_bootstraps=vector('list', length(bootstraps)) for(i in seq_along(bootstraps)) { out_bootstraps[[i]]=lm(mpg~disp, data = bootstraps[[i]]) } out_bootstraps #lapply lapply(bootstraps, lm, formula = mpg~disp)
For each model in the previous two exercises, extract $R^{2}$ using the function below.
rsq=function(mod) summary(mod)$r.squared
set.seed(1) rsq=function(mod) summary(mod)$r.squared #Exercise 3 lapply(out_formulas, rsq) #Exercise 4 lapply(out_bootstraps, rsq)
The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.
trials=replicate(100,t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE)
Extra challenge: get rid of the anonymous function by using[[ directly.
set.seed(1) p=numeric(100) trials= replicate(100,t.test(rpois(10, 10), rpois(7, 10)),simplify = FALSE) #for loops for(i in 1:100){ p[i]=trials[[i]]$p.value } p #sapply x=c(1:100) p1=sapply(trials,function(mod) mod$p.value) p1 #anonymous function p2=sapply(trials,'[[','p.value') p2 #compare the consequence of the for loops and sapply identical(p,p1) identical(p,p2)
Implement a combination of Map() and vapply() to create an lapply() variant that iterates in parallel over all of its inputs and stores its outputs in a vector (or a matrix). What arguments should the function take?
set.seed(1) library(parallel) library(foreach) #parallel mcvMap=function(f, FUN.VALUE , ...) { out=Map(f, ...) vapply(out, FUN, FUN.VALUE) } #foreach y=foreach(x=1:1000,.combine='rbind') %do% exp(x)
Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or by coding from the mathematical definition ( http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test ).
set.seed(1) options(warn = -1) chisq_test2=function(x, y) { n=sum(x) + sum(y) ni._x=sum(x) ni._y=sum(y) chisq.statistic=0 for (i in seq_along(x)) { n.j=x[i] + y[i] expected_x=(ni._x*n.j)/n expected_y=(ni._y*n.j)/n chisq.statistic=chisq.statistic+((x[i]-expected_x)^2)/expected_x chisq.statistic=chisq.statistic+((y[i]-expected_y)^2)/expected_y } chisq.statistic } x=c(3,8,4,9) y=c(6,9,3,5) a=matrix(c(x,y),4,2) print(chisq_test2(x,y)) print(chisq.test(a)$statistic) #compare the computation time of the two functions microbenchmark::microbenchmark(chisq_test2(x,y),chisq.test(a))
From the results,we can see the function chisq_test2 is mucher faster than the function chisq.test.
Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?
set.seed(1) x=c(3,8,4,9) y=c(6,9,3,5) b=table(x,y) summary(b)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.