x<-seq(0,2*pi,length=100) y1<-sin(x) y2<-cos(x) plot(x,y1,type="l",lwd=2,ylab="y") lines(x,y2,lwd=2,lty=2) abline(h=0,lwd=2) abline(v=0,lwd=2)
set.seed(1958); y<-rnorm(100) hist(y,freq=FALSE) res<-density(y); lines(res,col="red")
library(lattice) data(quakes); mini <- min(quakes$depth) ; maxi <- max(quakes$depth) ; int <- ceiling((maxi - mini)/9); inf <- seq(mini, maxi, int) ; quakes$depth.cat <- factor(floor(((quakes$depth - mini) / int)), labels=paste(inf, inf + int, sep="-")) ; xyplot(lat ~ long | depth.cat, data = quakes)
data("pressure"); names(pressure) lm.summary<-lm(temperature~pressure,data=pressure) plot(x=pressure$pressure,y=pressure$temperature,xlab="pressure",ylab="temperature") plot(lm.summary)
cells<-c(1,16,24,68) rnames<-c("R1","R2") cnames<-c("C1","C2") mymatrix<-matrix(cells,nrow=2,ncol=2,byrow=TRUE,dimnames = list(rnames,cnames)) mymatrix
title: "Homework-18003-2018/09/21" author: "A18003" date: "2018/09/21" output: html_document
Idea:
1.Generate U~U(0,1)
2.If $F_X$($x_{i-1}$)$<$U$\le$$F_X$($x_{i}$),Then X=$x_i$
set.seed(2) x<-c(0,1,2,3,4); p<-c(0.1,0.2,0.2,0.2,0.3); cp<-cumsum(p); m<-10^3; r<-numeric(m); r<-x[findInterval(runif(m),cp,left.open = TRUE)+1]#"left.open=TRUE,because in findInterval function ,the intervals are open at right and closed at left. r ct<-as.vector(table(r));#calulate frequency of the table "r" ct/sum(ct);
Idea:
1.$$Generate\quad random\quad numbers\quad U\sim U(0,1)\quad and\quad Y\sim g(.)$$
2.$$If\quad U\le\rho (Y)\quad then\quad accept\quad Y\quad and\quad stop(return X=Y) \quad\rho(x)=\frac{f(x)}{cg(x)}$$
3.$$beta(\alpha,\beta) \quad with\quad pdf\quad f(x)=\frac{x^{\alpha-1}(1-x)^{\beta-1}}{B(\alpha,\beta)}.
\g(x)=1,0<x<1 \quad and\quad c=\frac{1}{B(\alpha,\beta)} $$
rnbeta<-function(a,b,N){ n<-N; j<-k<-0; y<-numeric(n); while(k<n){ u<-runif(1); j<-j+1; x<-runif(1) #random variate from g if(x^(a-1)*(1-x)^(b-1)>u){ #we aceept x k<-k+1; y[k]<-x;} } return(y)}; x<-rnbeta(3,2,1000); p1 = hist(x,plot = T,freq = FALSE,breaks=seq(0,1,0.01),col="yellow",main = "a-r menthod for generating random number"); d=seq(from=min(x),to=max(x),by=0.01) lines(x=d,y=dbeta(d,3,2),lty=2,col="red")#add density curve
Question:Simulate a continuous Exponential-Gamma mixtrue
n<-1000; r<-4; beta<-3; lambda<-rgamma(n,r,beta); x<-rexp(n,lambda); hist(x,plot=T,freq=FALSE,breaks=50,col="skyblue",main="Exponential-Gamma mixture") lines(density(x,bw=1),col='red',lty=2)
title: "Homework-18003-2018/9/28" author: "A18003" date: "2018/9/28" output: html_document
Write a function to compute a Monte Carlo estimate of the Beta(3, 3) cdf,and use the function to estimate F(x) for x = 0.1, 0.2, . . ., 0.9. Compare the
estimates with the values returned by the pbeta function in R
$$\Phi(x)=\int^{x}{0}30t^2(1-t)^2dt\let\,\,y=\frac{t}{x},we\,have\,dt=xdy\,\,\,then\\theta=\int^{1}{0}30x^3y^2(1-xy)^2dy$$
x<-seq(0.1,0.9,length = 9) m<-10000 set.seed(111) u<-runif(m) cdf <- numeric(length(x)) for (i in 1:length(x)) { g<-30*(x[i])^3 *u^2*(1-x[i]*u)^2 cdf[i]<-mean(g) } Phi <- pbeta(x,3,3) print(round(rbind(x, cdf, Phi), 3))
The Rayleigh density is $$f(x)=\frac{x}{\sigma^2}e^{-x^2/(2\sigma^2)}\,, x \,\, 0, \sigma > 0.\ \Phi(x)=\int_{0}^{x}\frac{t}{\sigma^2}e^{-t^2/(2\sigma^2)}dt\let\,\,y=\frac{t}{x}??we\,have\,dt=xdy\,\,\,then\\theta=\int_{0}^{1}\frac{xy}{\sigma^2}e^{-(xy)^2/(2\sigma^2)}xdy$$ Implement a function to generate samples from a Rayleigh(??) distribution, using antithetic variables. What is the percent reduction in variance of $\frac{X+X'}{2}$ compared with $\frac{X_1+X_2}{2}$ for independent $X_1, X_2$?
MC.Phi<-function(x,sigma,R= 10000, antithetic = TRUE) { u <- runif(R/2) if (!antithetic) v<- runif(R/2) else v<-1-u w<-c(u, v) cdf <- numeric(length(x)) for (i in 1:length(x)) { g <-x[i]*w/(sigma)^2*exp((-(x[i]*w)^2/(2*sigma*sigma)))*x[i] cdf[i] <- mean(g) } return(cdf) } x <- seq(.1,2.5,length=5); pRayleigh<-function(x,sigma){ s<-sigma; p<-numeric(length(x)); intergrand<-function(x){ x/(s^2)*exp((-x^2/(2*(s^2))))}; for(i in 1:length(x)){ p[i]<-integrate(intergrand,0,x[i])$value; } return(p) } Phi<-pRayleigh(x,sigma=2) set.seed(123) MC1<- MC.Phi(x,sigma=2,anti=FALSE) #for (X1+X2)/2 which X1,X2 is independent set.seed(123) MC2<- MC.Phi(x,sigma=2,anti=TRUE) #for antithetic variables (X+X')/2 print(round(rbind(x, MC1, MC2, Phi),5)) m <- 1000 MC1 <- MC2 <- numeric(m) x <- 1.95 for (i in 1:m) { MC1[i] <- MC.Phi(x,2,R = 1000, anti = FALSE) MC2[i] <- MC.Phi(x,2,R = 1000,anti=TRUE) } print(sd(MC1)) print(sd(MC2)) print((var(MC1) - var(MC2))/var(MC1))
The antithetic variable approach achieved approximately 91.9% reduction in variance at x = 1.95.and $\sigma=2$
Find two importance functions f1 and f2 that are supported on $(1,\infty )$ and
are ??close?? to
$$g(x)=\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx\quad x > 1.$$
Which of your two importance functions should produce the smaller variance
in estimating
$$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$
by importance sampling? Explain.
Solve:
let $f_1(x)=\frac{1}{\sqrt{2\pi}}e^{-x^2/2}\quad -\infty<x<+\infty$
$\quad f_2(x)=e^{-x+1}\quad 1<x<+\infty$
x<-seq(1,10,length=100) y<-1/sqrt(2*pi)*exp(-x^2/2); z<-exp(-x+1) plot(x,y,col="red",lty=1,xlim = c(0,10)) lines(x,z,col="green",lty=1,xlim=c(0,10)) y1<-x^2; z1<-y1*y/z; plot(x,y1,col="red",lty=1,xlim = c(0,10)) lines(x,z1,col="green",lty=1,xlim=c(0,10))
we can see both f1 and f2 are close to g
Obtain a Monte Carlo estimate of $$\int_{1}^{\infty}\frac{x^2}{\sqrt{2\pi}}e^{-x^2/2}dx$$ by importance sampling.
m <- 10000 set.seed(111) theta.hat <- se <- numeric(2) g <- function(x) { exp(-x^2/2)*x^2/(sqrt(2*pi))*(x >1) } x <- rnorm(m) #using f1 fg <- g(x)/dnorm(x) theta.hat[1] <- mean(fg) se[1] <- sd(fg) set.seed(222) u <- runif(m) #f2, inverse transform method x <- 1- log(1 - u) fg <- g(x) / (exp(-x+1)) theta.hat[2] <- mean(fg) se[2] <- sd(fg) print(rbind(theta.hat, se))
title: "Homework-18003-2018/10/12" author: "A18003" date: "2018/10/12" output: html_document
Let X be a non-negative random variable with$\mu =E[X] < \infty$. For a random sample$x_1,...,x_n$from the distribution of X, the Gini ratio is defined by $$G=\frac{1}{2n^2\mu}\sum^{n}{j=1}\sum^{n}{i=1}|x_i-x_j|$$ The Gini ratio is applied in economics to measure inequality in income distribution.Note that G can be written in terms of the order statistics$x_{(i)}$as $$G=\frac{1}{n^2\mu}\sum^{n}{i=1}(2i-n-1)x{(i)}.$$ f the mean is unknown, let $\hat G$ be the statistic G with $\mu$ replaced by$\bar x$. Estimate by simulation the mean, median and deciles of $\hat G$ if X is standard lognormal. Repeat the procedure for the uniform distribution and Bernoulli(0.1). Also construct density histograms of the replicates in each case.
n=50 m=1000 G_sam1<-numeric(m) G_sam2<-numeric(m) G_sam3<-numeric(m) set.seed(110) for(i in 1:m) { x<-sort(rlnorm(n,0,1)) y<-sort(runif(n,0,1)) z<-sort(rbinom(n,1,0.5)) J=2*seq(1:n)-n-1 G_sam1[i]=(J%*%x)/(n^2*mean(x)) G_sam2[i]=(J%*%y)/(n^2*mean(y)) G_sam3[i]=(J%*%z)/(n^2*mean(z)) } log_norm<-c(mean(G_sam1),median(G_sam1),quantile(G_sam1,seq(0,1,0.1))) unif_norm<-c(mean(G_sam2),median(G_sam1),quantile(G_sam2,seq(0,1,0.1))) Bern_norm<-c(mean(G_sam2),median(G_sam1),quantile(G_sam3,seq(0,1,0.1))) A=rbind(log_norm,unif_norm,Bern_norm) colnames(A)=c("mean","median",names(quantile(G_sam1,seq(0,1,0.1)))) print(A) hist(G_sam1,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for lognorm") hist(G_sam2,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for uniform") hist(G_sam3,freq=FALSE,col="skyblue",prob=TRUE,main="G density histograms for Bernoulli")
Construct an approximate 95% confidence interval for the Gini ratio $\gamma = E[G]$ if X is lognormal with unknown parameters. Assess the coverage rate of the estimation procedure with a Monte Carlo experiment.
n=20 m=100 G_sam1<-numeric(m) alpha=0.025 UCL_low<-numeric(1000) UCL_upp<-numeric(1000) Mean<-numeric(1000) for(k in 1:1000) { for(i in 1:m) #generate m replitcates { x<-sort(rlnorm(n,0,1)) J=2*seq(1:n)-n-1 G_sam1[i]=(J%*%x)/(n^2*mean(x)) } UCL_low[k]<- mean(G_sam1)-qnorm(1-alpha)*sd(G_sam1)/sqrt(m) UCL_upp[k]<- mean(G_sam1)+qnorm(1-alpha)*sd(G_sam1)/sqrt(m) Mean[k]<-mean(G_sam1) } fin_mean<-mean(Mean) k=sum(UCL_low<fin_mean &UCL_upp>fin_mean) k/1000
Tests for association based on Pearson product moment correlation $\rho$, Spearman??s rank correlation coefficient $\rho_s$, or Kendall??s coefficient $\tau$, are implemented in cor.test. Show (empirically) that the nonparametric tests based on $\rho_s$ or $\tau$, are less powerful than the correlation test when the sampled distribution is bivariate normal. Find an example of an alternative (a bivariate distribution (X, Y ) such that X and Y are dependent) such that at least one of the nonparametric tests have better empirical power than the correlation test against this alternative.
library(MASS) mu0<-c(1,1) n<-20 m <- 1000 ptest<-stest<-ktest<-numeric(m) pho<-seq(0.1,0.9,length=21) #alternatives power1 <- numeric(21) power2 <- numeric(21) power3 <- numeric(21) set.seed(122) for (i in 1:21) { sigma_i <-matrix(c(1,0.1+0.04*(i-1),0.1+0.04*(i-1),1),nrow = 2) for(k in 1: m){ x <- mvrnorm(n,mu0,sigma_i) ptest[k]<- cor.test(x[,1],x[,2],alternative = "greater",method = "pearson")$p.value stest[k]<-cor.test(x[,1],x[,2],alternative = "greater",method = "spearman")$p.value ktest[k]<-cor.test(x[,1],x[,2],alternative = "greater",method = "kendall")$p.value } power1[i] <- mean(ptest<=0.05) power2[i] <- mean(stest<=0.05) power3[i] <- mean(ktest<=0.05) } plot(pho, power1,type="l",col="blue") lines(pho,power2,type="l",col="red") lines(pho,power3,type="l",col="yellow")
for X Y is independent
library(MASS) m=1000 n=20 mean<-c(1,1) sigma=matrix(c(1,0,0,1),2,2) p_pearson<-numeric(m) p_Spearman<-numeric(m) p_Kendall<-numeric(m) for(i in 1:m) { sample=mvrnorm(n,mean,sigma) x=sample[,1] y=sample[,2] test1<-cor.test(x,y,method="pearson") test2<-cor.test(x,y,method="kendall") test3<-cor.test(x,y,method="spearman") p_pearson[i]<-test1$p.value p_Spearman[i]<-test2$p.value p_Kendall[i]<-test3$p.value } ratio=c(mean(p_pearson<0.05),mean(p_Spearman<0.05),mean(p_Kendall<0.05)) names(ratio)<-c("peason","Speqrman","Kendall") print(ratio)
title: "Homework-2018.11.02" author: "A18003" date: "2018/11/02" output: html_document
Compute a jackknife estimate of the bias and the standard error of the correlation statistic in Example 7.2.
library(bootstrap); attach(law);#attach data n = length( LSAT ) R.hat = cor( LSAT,GPA ) R.jack = numeric ( n ) for (i in 1:n) { R.jack[i] = cor( LSAT[-i],GPA[-i] ) } #jackknife bias.jack = (n-1)*( mean(R.jack) - R.hat ) R.bar = mean(R.jack) se.jack = sqrt( (n-1)*mean( (R.jack-R.bar)^2 ) )
we find r bias.jack for the bias and r se.jack for std.error
Refer to Exercise 7.4. Compute 95% bootstrap confidence intervals for the mean time between failures$\frac{1}{\lambda} $by the standard normal, basic, percentile, and BCa methods. Compare the intervals and explain why they may differ.
require(boot); attach(aircondit) x=hours gaptime.hat = mean(x) #MLE of 1/lambda B = 5000 #no. boostrap resamples set.seed(5) exc75.boot = boot( data=aircondit,statistic=function(x,i){mean(x[i,])}, R=B )
The bootstrap summary reports:
exc75.boot
we can see the MLE is $\frac{1}{\hat \lambda}$=r exc75.boot$t0,with estimated std.error of$\hat{se}_{boot}$=37.86823
when we calculate the confidence interval by BCa,we need the empirical influence values of the statistic of interest for the observed data.
So
einf.jack = empinf(exc75.boot, type='jack')#where the call to empinf() is used to generate an empirical influence object using the usual jackknife. boot.ci(exc75.boot, type=c('norm','basic','perc','bca'), L=einf.jack)
We see the intervals are quite disparate. A primary reason is that the bootstrap distribution is still skewed, affecting the simpler methods and their appeal to the Central Limit Theorem. For example
hist(exc75.boot$t, main='', xlab=expression(1/lambda), prob=T) points(exc75.boot$t0, 0, pch = 19,col="red")
(the solid dot, ??, indicates the MLE). The BCa interval incorporates an acceleration adjustment for skew, and may be preferred here.
Refer to Exercise 7.7. Obtain the jackknife estimates of bias and standard error of $\hat \theta$.
require(bootstrap); attach(scor);#attach data theta = function(x){ eigen(cov(x))$values[1]/sum(eigen(cov(x))$values) } #end function n=length(scor[,1]); x=as.matrix(scor); lambda.hat = eigen(cov(x))$values theta.hat = lambda.hat[1]/sum(lambda.hat) theta.jack=numeric(n) for (i in 1:n) {theta.jack[i]=theta( x[-i,] ) }#jackknife bias.jack=(n-1)*(mean(theta.jack)-theta.hat) theta.bar=mean(theta.jack) se.jack = sqrt((n-1)*mean( (theta.jack-theta.bar)^2 )) print( list(theta.hat=theta(scor), bias=bias.jack, se=se.jack) ) detach(package:bootstrap)
we see $\hat{bias}{jack}(\hat\theta)$ =r bias.jack,and $\hat{se}{jack}(\hat\theta)$=r se.jack
In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Use leave-two-out cross validation to compare the models.
library(DAAG); attach(ironslag); n <- length(magnetic) #in DAAG ironslag e1 <- e2 <- e3 <- e4 <- 0; # for n/2-fold cross validation # fit models on leave-two-out samples for (k in 1:(n-1)) { a<-c(k,(k+1)) y <- magnetic[-a] x <- chemical[-a] J1 <- lm(y ~ x) ee1 = magnetic[a]- predict( J1, newdata=data.frame(x=chemical[a]) ) e1[k]<-sum(ee1^2) J2 <- lm(y ~ x + I(x^2)) ee2 = magnetic[a]- predict( J2, newdata=data.frame(x=chemical[a]) ) e2[k]<-sum(ee2^2) J3 <- lm(log(y) ~ x) ee3 = magnetic[a]- exp(predict( J3, newdata=data.frame(x=chemical[a]))) e3[k]<-sum(ee3^2) J4 <- lm(log(y) ~ log(x)) ee4 = magnetic[a]- exp(predict( J4, newdata=data.frame(x=chemical[a]))) e4[k]<-sum(ee4^2) } print( list(mse1=mean(e1/2),mse2=mean(e2/2),mse3=mean(e3/2),mse4=mean(e4/2)) )
We can see Model 2 has the least prediction error 17.96998.
So according to the prediction error criterion, Model 2, the quadratic model,would be the best fit for the data.
title: "Homwork-2018-11-09" author: "A18003" date: "2018/11/09" output: html_document
knitr::opts_chunk$set(echo = TRUE)
Implement the two-sample Cram??er-von Mises test for equal distributions as a permutation test. Apply the test to the data in Examples 8.1 and 8.2 Cram??er-von Mises where the Cram??er-von Mises distance is defined by $$W_2=\frac{mn}{(m+n)^2}[\sum^n_{i=1}(F_n(x_i)-G_m(x_i))^2+\sum^m_{j=1}(F_n(y_j)-G_m(y_j))^2]$$
cvm<-function(x,y,N){ reps<-numeric(N);#permutation numbers n<-length(x); m<-length(y); f<-ecdf(x); g<-ecdf(y); z<-c(x,y); t<-numeric(N); t0<-m*n/(m+n)^2*(sum((f(x)-g(x))^2)+sum((f(y)-g(y))^2)); for(i in 1:N){ k<-sample(length(z),size=n,replace = FALSE); x1<-z[k]; y1<-z[-k]; f1<-ecdf(x1); g1<-ecdf(y1); t[i]<-m*n/(m+n)^2*(sum((f1(x)-g1(x))^2)+sum((f1(y)-g1(y))^2)) } return(c(t0,t)); } #example 1 set.seed(2) attach(chickwts) x1 <- sort(as.vector(weight[feed == "soybean"])); y1 <- sort(as.vector(weight[feed == "linseed"])); cvm1<-cvm(x1,y1,999); p1 <- mean(cvm1 >= cvm1[1]); p1 hist(cvm1, main = "Cram??er-von Mises test", freq = FALSE, xlab = "w (p = 0.385)",breaks = "scott"); points(cvm1[1], 0, cex = 1, pch = 16); detach(chickwts)
from the table we can accept x and y have the same distribution.
Design experiments for evaluating the performance of the NN,energy, and ball methods in various situations.
> Unequal variances and equal expectations
> Unequal variances and unequal expectations
> Non-normal distributions: t distribution with 1 df (heavy-tailed distribution), bimodel distribution (mixture of two normal distributions)
> Unbalanced samples (say, 1 case versus 10 controls)
> Note: The parameters should be choosen such that the powers
are distinguishable (say, range from 0.3 to 0.9).
library(RANN) library(energy) library(Ball) library(boot) library(ggplot2) set.seed(123) m <- 50; k<-3; p<-2; n1<- n2 <- 50; R<-999; n <- n1+n2; N = c(n1,n2) Tn <- function(z, ix, sizes,k) { n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2 if(is.vector(z)) z <- data.frame(z,0); z <- z[ix, ]; NN <- nn2(data=z, k=k+1) # what's the first column? block1 <- NN$nn.idx[1:n1,-1] block2 <- NN$nn.idx[(n1+1):n,-1] i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5) (i1 + i2) / (k * n) } eqdist.nn <- function(z,sizes,k){ boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k) ts <- c(boot.obj$t0,boot.obj$t) p.value <- mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p.values <- matrix(NA,m,3) for(i in 1:m){ x <- matrix(rnorm(n1*p,mean = 1,sd = 0.6),ncol=p); y <- matrix(rnorm(n2*p,mean = 1,sd = 0.8),ncol=p); z <- rbind(x,y) p.values[i,1] <- eqdist.nn(z,N,k)$p.value p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value } alpha <- 0.05; pow <- colMeans(p.values<alpha) print(pow) power <- data.frame(methods = c('NN','energy','Ball'),pow) ggplot(power,aes(methods,pow))+#plot geom_col(fill = 'palegreen3')+ coord_flip()
set.seed(122) m <- 50; k<-3; p<-2; n1<- n2 <- 50; R<-999; n <- n1+n2; N = c(n1,n2) Tn <- function(z, ix, sizes,k) { n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2 if(is.vector(z)) z <- data.frame(z,0); z <- z[ix, ]; NN <- nn2(data=z, k=k+1) # what's the first column? block1 <- NN$nn.idx[1:n1,-1] block2 <- NN$nn.idx[(n1+1):n,-1] i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5) (i1 + i2) / (k * n) } eqdist.nn <- function(z,sizes,k){ boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k) ts <- c(boot.obj$t0,boot.obj$t) p.value <- mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p.values <- matrix(NA,m,3) for(i in 1:m){ x <- matrix(rnorm(n1*p,mean = 0.3,sd = 0.6),ncol=p); y <- matrix(rnorm(n2*p,mean = 0.4,sd = 0.8),ncol=p); z <- rbind(x,y) p.values[i,1] <- eqdist.nn(z,N,k)$p.value p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value } alpha <- 0.05; pow <- colMeans(p.values<alpha) print(pow) power <- data.frame(methods = c('NN','energy','Ball'),pow) ggplot(power,aes(methods,pow))+#plot geom_col(fill = 'palegreen3')+ coord_flip()
set.seed(124) m <- 50; k<-3; p<-2; n1 <- n2 <- 20; R<-999; n <- n1+n2; N = c(n1,n2) Tn <- function(z, ix, sizes,k) { n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2 if(is.vector(z)) z <- data.frame(z,0); z <- z[ix, ]; NN <- nn2(data=z, k=k+1) # what's the first column? block1 <- NN$nn.idx[1:n1,-1] block2 <- NN$nn.idx[(n1+1):n,-1] i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5) (i1 + i2) / (k * n) } eqdist.nn <- function(z,sizes,k){ boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k) ts <- c(boot.obj$t0,boot.obj$t) p.value <- mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p.values <- matrix(NA,m,3) for(i in 1:m){ # t distribution with 1 df (heavy-tailed distribution) x <- matrix(rt(n1*p,df = 1),ncol=p); #bimodel distribution (mixture of two normal distributions) y <- cbind(rnorm(n2,mean = 0.4),rnorm(n2,mean = 0.5)); z <- rbind(x,y) p.values[i,1] <- eqdist.nn(z,N,k)$p.value p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value } alpha <- 0.05; pow <- colMeans(p.values<alpha) print(pow) power <- data.frame(methods = c('NN','energy','Ball'),pow) ggplot(power,aes(methods,pow))+#plot geom_col(fill = 'palegreen3')+ coord_flip()
set.seed(125) m <- 50; k<-3; p<-2; n1 <- 10;n2 <- 100;R<-999; n <- n1+n2; N = c(n1,n2) Tn <- function(z, ix, sizes,k) { n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2 if(is.vector(z)) z <- data.frame(z,0); z <- z[ix, ]; NN <- nn2(data=z, k=k+1) # what's the first column? block1 <- NN$nn.idx[1:n1,-1] block2 <- NN$nn.idx[(n1+1):n,-1] i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5) (i1 + i2) / (k * n) } eqdist.nn <- function(z,sizes,k){ boot.obj <- boot(data=z,statistic=Tn,R=R,sim = "permutation", sizes = sizes,k=k) ts <- c(boot.obj$t0,boot.obj$t) p.value <- mean(ts>=ts[1]) list(statistic=ts[1],p.value=p.value) } p.values <- matrix(NA,m,3) for(i in 1:m){ x <- c(rnorm(n1,mean = 1,sd = 1)); # n1 = 10 y <- c(rnorm(n2,mean = 2,sd = 2)); # n2 = 100 z <- c(x,y) p.values[i,1] <- eqdist.nn(z,N,k)$p.value p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value p.values[i,3] <- bd.test(x=x,y=y,R=999,seed = i*12)$p.value } alpha <- 0.05; pow <- colMeans(p.values<alpha) print(pow) power <- data.frame(methods = c('NN','energy','Ball'),pow) ggplot(power,aes(methods,pow))+#plot geom_col(fill = 'palegreen3')+ coord_flip()
Use the Metropolis-Hastings sampler to generate random variables from a standard Cauchy distribution. Discard the first 1000 of the chain, and compare the deciles of the generated observations with the deciles of the standard Cauchy distribution (see qcauchy or qt with df=1). Recall that a Cauchy(??, ??) distribution has density function $$f(x)=\frac{1}{\theta\pi(1+[(x-\eta)/\theta]^2)}$$ The standard Cauchy has the Cauchy(?? = 1, ?? = 0) density. (Note that the standard Cauchy density is equal to the Student t density with one degree of freedom.)
f903 <- function(x, eta=0, theta=1) { stopifnot( theta > 0 ) return( 1 / (pi*theta * (1 + ((x-eta)/theta)^2)) ) } #end function set.seed(903) m <- 10000 sigma = 1 #try sigma = 1 for proposal scale x <- numeric(m) x[1] <- rnorm(1,0,sigma) #initialize with X0~N(0,sigma^2) k <- 0 u <- runif(m) for (i in 2:m) { xt <- x[i-1] y <- rnorm(1, mean = xt, sd = sigma) num <- f903(y) * dnorm(xt, mean = y, sd = sigma) den <- f903(xt) * dnorm(y, mean = xt, sd = sigma) if (u[i] <= num/den) x[i] <- y else { x[i] <- xt k <- k+1 #y is rejected } #end if/else } #end for loop print(k/m) #rejection rate
(One could just use dcauchy(x) instead of the constructed function f903(x) for the
Cauchy p.d.f.) The rejection rate is given asr k/m
which is rather low (i.e., higher-than-desired acceptance). Setting the burn-in to bo =
1000 draws, the retained chain is found via
b0 = 1000 #burn-in index = (b0+1):m y1 = x[index] #chain after burn-in
To compare deciles with the theoretical Cauchy(0,1) p.d.f., sample R code is
p10 = seq(.1,.9, .1) round( rbind( quantile(y1, p10), qcauchy(p10) ), 3 )
which produces not-unreasonable comparisons, but with some discrepancies in the tails: The trace plot, from
plot( y1~index, type="l", main="", ylab="x" )
Interestingly, a histogram of the retained chain (using b0 = 1000), with the target Cauchy(0,1) p.d.f., from the R code
hist( y1, prob=T, main='', xlab='x', ylim=c(0,.35), breaks=50 ) xarg = seq( min(y1), max(y1), 0.1 ) lines( xarg, f903(xarg), ylim=c(0,.35) )
Moving to, say, m = 50,000 draws leads to slight, but not satiating improvement (try it!). Living up to its reputation, the Cauchy is a difficult distribution with which to operate.
Rao [220, Sec. 5g] presented an example on genetic linkage of 197 animals in four categories (also discussed in [67, 106, 171, 266]). The group sizes are278 Statistical Computing with R (125, 18, 20, 34). Assume that the probabilities of the corresponding multinomial distribution are $$(\frac{1}{2}+\frac{\theta}{4},\frac{1-\theta}{4},\frac{1-\theta}{4},\frac{\theta}{4})$$ Estimate the posterior distribution of ?? given the observed sample, using one of the methods in this chapter.
Set the prior on $\theta$??? to ??? ~ U(0,1) so that $\pi(\theta???) = I_{(0,1)}(\theta)$. The multinomial likelihood is$f(X|\theta???)\propto \prod ip_i(\theta)^{x_i}/x_i!$, where $p_1(\theta)=(2+\theta)/4,p_2(\theta???)=p_3(\theta)=(1-\theta)/4, and\,\,\, p_4(\theta???) =\theta ???/4$.
Combining these via Bayes?? Rule produces the posterior
$$f (\theta|X)=\pi(\theta)f(X|\theta)/f(X)\propto(2+\theta)^{x_1}(1-\theta)^{x_2+x_3}\theta^{x_4}$$
Notice that posterior specification to proportionality is all that is needed here, since any constants in$f(\theta???|X)$ will cancel in the construction of $r(\theta_t,Y)$ in the McMC sampler.
For the proposal density, we usually desire a p.d.f. $g(y|\theta)$ defined over a space the same as that for ???; here this is 0 < y < 1. The simple choice of$ g(y|\theta)=I{(0,1)}(y)$, i.e., a uniform proposal independent of ???, corresponds to an independence sampler. However,independence samplers are not typically recommended. Instead, try a beta p.d.f.proposal with expected value equal to ???t. Different possibilities exist; one is
$Y|\theta \sim\beta(\theta_t/{1-\theta_t},1)$, where$E[Y|\theta_t]=[\theta_???t/{1?C\theta_t}]/[1+(\theta_???t/{1?C\theta_???t})] =\theta_t (For 0 < \theta <1)$, this is a right-skewed, decreasing p.d.f.) Sample R code for an McMC sampler from$ f(\theta???|X)$
f906 <- function( th,x ) { if (th<0 || th>1 ) return (0) (2+th)^x[1] * (1-th)^(x[2]+x[3]) * th^x[4] } #end function xdata = c( 125, 18, 20, 34 ) #observed multinom. data m = 10000 set.seed(906) th = numeric(m) th[1] = runif(1) #initialize: sample from prior on theta k = 0 u = runif(m) ##employ skewed beta proposal density for (t in 2:m) { xt = th[t-1] alph = xt/(1-xt) y <- rbeta(1, shape1=alph, shape2=1 ) numer = f906( y,xdata ) * dbeta( xt, y/(1-y), 1) denom = f906( xt,xdata ) * dbeta( y, alph, 1) if ( u[t] <= numer/denom ) th[t] = y else { th[t] = th[t-1] k = k + 1 } #end if/else } #end for loop
Using this code, the posterior sample of 10,000 points resides in the vector th; its raw trace plot is found via
plot( th, type="l", ylim=range(th), xlab=bquote(theta), ylab="posterior" )
This produces
which appears somewhat variable, but not otherwise unacceptable. A burn-in of perhaps
bo = 2000 will likely suffice. The consequent histogram, found via
hist( th[2001:m], prob=T, breaks="Scott", xlab=bquote(theta), ylab='posterior', main="" )
shows a generally unimodal, slightly left-skewed sample.
The associated posterior mean (a Bayesian estimate) of ???, and the corresponding
posterior ??cell?? probabilities are found via
theta.hat = mean( th[2001:m] ) theta.hat
and
print(c( 0.5 + theta.hat/4, (1 - theta.hat)/4, (1 - theta.hat)/4, theta.hat/4) ) #p.hat vector
respectively.
title: "Homework-2018-11-23" author: "A18003" date: "2018/11/23" output: html_document
Find the intersection points A(k) in $(0, \sqrt{k})$ of the curves $$S_{k-1}(a)=P(t(k-1)>\sqrt{\frac{a^2(k-1)}{k-a^2}})$$ and$$S_{k}(a)=P(t(k)>\sqrt{\frac{a^2k}{k+1-a^2}})$$ for k = 4 : 25, 100, 500, 1000, where t(k) is a Student t random variable with k degrees of freedom.
The upper-tail t-probabilities are found via the pt() function with the lower.tail=F option. The intersection of the two upper-tail functions $S_{k-1}(a)$and$S_k(a)$ occurs when their difference is zero. A quick plot of selected differences $S_{k-1}(a)-S_k(a)$ shows that the likely solutions occur in the interval$1<a<2$, so use this as the capture interval for application of Brent??s method
k = c( 4:25, 100, 500, 1000 ) object = function( a, df ){ a2 = a^2 arg = sqrt( a2*df/(df + 1 - a2) ) Sk = pt( q=arg, df=df, lower=F) arg = sqrt( a2*(df-1)/(df - a2) ) Skm1 = pt( q=arg, df=df-1, lower=F) return( Sk-Skm1 ) } #end function for ( i in 1:length(k) ) { print( c( k[i], uniroot(object, lower=1, upper=2, df=k[i])$root ) ) } #end for loop
title: "Homework-2018-12-07" author: "A18003" date: "2018/12/07" output: html_document
Use both for loops and lapply() to fit linear models to the mtcars using the formulas stored in this list:
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)
attach(mtcars) formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt ) #1 for loops lf_3<- vector("list", length(formulas)) for (i in seq_along(formulas)){ lf_3[[i]] <- lm(formulas[[i]], data = mtcars) } #2 lapply la_3<-lapply(formulas, function(x) lm(formula = x, data = mtcars))
Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply().Can you do it without an anonymous function?
set.seed(123) bootstraps <- lapply(1:10, function(i) { rows <- sample(1:nrow(mtcars), rep = TRUE) mtcars[rows, ] }) # for loops lf_4<- vector("list", length(bootstraps)) for (i in seq_along(bootstraps)){ lf_4[[i]] <- lm(mpg ~ disp, data = bootstraps[[i]]) } # lapply without anonymous function la_4<- lapply(bootstraps, lm, formula = mpg ~ disp)
For each model in the previous two exercises, extract R2 using the function below. rsq <- function(mod) summary(mod)$r.squared
rsq <- function(mod) summary(mod)$r.squared #For the models in exercise 3: sapply(la_3, rsq) sapply(lf_3, rsq) #For the models in exercise 4: sapply(la_4,rsq) sapply(lf_4,rsq)
The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.
set.seed(3) trials <- replicate( 100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE ) # anonymous function: sapply(trials, function(x) x[["p.value"]]) # without anonymous function: sapply(trials, "[[", "p.value")
title: "Homework-18003-2018/12/14" author: "A18003" date: "2018/12/14" output: html_document
Make a faster version of chisq.test() that only computes the chi-square test statistic when the input is two numeric vectors with no missing values. You can try simplifying chisq.test() or by coding from the mathematical definition
options(warn = -1) chisq.test1<- function(x, y){ #?ж?????ֵ?Ƿ????? if (!is.numeric(x)) { stop("x must be numeric")} if (!is.numeric(y)) { stop("y must be numeric")} if (length(x) != length(y)) { stop("x and y must have the same length")} if (length(x) <= 1) { stop("length of x must be greater one")} if (any(c(x, y) < 0)) { stop("all entries of x and y must be greater or equal zero")} if (sum(complete.cases(x, y)) != length(x)) { stop("there must be no missing values in x and y")} if (any(is.null(c(x, y)))) { stop("entries of x and y must not be NULL")} #???? m <- rbind(x, y) margin1 <- rowSums(m) margin2 <- colSums(m) n <- sum(m) me <- tcrossprod(margin1, margin2) / n x_stat = sum((m - me)^2 / me)#chisqͳ??�� dof <- (length(margin1) - 1) * (length(margin2) - 1)#???ɶ? p <- pchisq(x_stat, df = dof, lower.tail = FALSE)#pֵ return(list(x_stat = x_stat, df = dof, `p-value` = p)) } #check a=31:35; b=c(31,33,35,37,39); m<-cbind(a,b) chisq.test1(a,b) chisq.test(m) microbenchmark::microbenchmark( chisq.test(m), chisq.test1(a, b))
Can you make a faster version of table() for the case of an input of two integer vectors with no missing values? Can you use it to speed up your chi-square test?
table2 <- function(x, y) { x_val <- unique(x) y_val <- unique(y) mat <- matrix(0L, length(x_val), length(y_val)) for (i in seq_along(x)) { mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <- mat[which(x_val == x[[i]]), which(y_val == y[[i]])] + 1L } dimnames <- list(x_val, y_val) names(dimnames) <- as.character(as.list(match.call())[-1]) # R has names for dimnames... :/ tab <- array(mat, dim = dim(mat), dimnames = dimnames) class(tab) <- "table" tab } a <- c(1, 2, 3, 1, 2, 3) b <- c(2, 3, 4, 2, 3, 4) identical(table(a, b), table2(a, b)) microbenchmark::microbenchmark(table(a, b), table2(a, b)) c<-c(1,2,3,4,5) identical(table(c, c), table2(c, c)) microbenchmark::microbenchmark(table(c, c), table2(c, c))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.