SC19077 is a simple R package that estimates the parameters of a mixture of two normal distributions by two methods: maximum likelihood (via numerical optimization) and the EM algorithm.
```r
mixnormal2 <- function(x, initial, data){
  # Negative log-likelihood of a two-component normal mixture,
  # params = (p, mu1, sigma1, mu2, sigma2)
  LL <- function(params, data) {
    t1 <- dnorm(data, params[2], params[3])
    t2 <- dnorm(data, params[4], params[5])
    f  <- params[1] * t1 + (1 - params[1]) * t2
    -sum(log(f))
  }
  # Minimize the negative log-likelihood subject to simple box constraints
  res <- nlminb(initial, LL, data = data,
                lower = c(.00001, -Inf, .00001, -Inf, .00001),
                upper = c(.99999,  Inf,  Inf,   Inf,  Inf))
  # Fitted mixture density evaluated at x
  res$par[1] * dnorm(x, res$par[2], res$par[3]) +
    (1 - res$par[1]) * dnorm(x, res$par[4], res$par[5])
}
```
$LL$ is the negative log-likelihood of the data. We then use $nlminb()$ to obtain the parameter estimates: its main arguments here are $initial$ (the starting values of the parameters), $LL$ (the objective function) and $data$ (the data used for the MLE). Since $nlminb()$ minimizes its objective, $LL$ returns the negative of the log-likelihood.
An example is shown below:
```r
n <- 200
u <- runif(n)
x <- ifelse(u < .7, rnorm(n), rnorm(n, 3, 9))
mixnormal2(0.8, c(.65, -0.1, 1.59, 3.21, 7.99), x)
```
```r
gmm <- function(p, mu, sig, data){
  n <- length(data)
  dat <- matrix(rep(data, 2), ncol = 2, byrow = FALSE)
  prob <- matrix(0, nrow = n, ncol = 2)
  while(TRUE){
    # E-step: posterior probability that each observation comes from each component
    for(i in 1:2) prob[, i] <- as.matrix(sapply(dat[, i], dnorm, mu[i], sig[i]))
    pmatrix <- matrix(rep(p, n), ncol = 2, byrow = TRUE)
    a <- pmatrix * prob
    b <- matrix(rep(rowSums(a), 2), ncol = 2)
    w <- a / b
    oldp <- p; oldmu <- mu; oldsig <- sig
    # M-step: update mixing proportions, means and variances
    p   <- colSums(w) / n
    mu  <- colSums(dat * w) / colSums(w)
    d   <- matrix(rep(mu, n), ncol = 2, byrow = TRUE)
    sig <- colSums(w * (dat - d)^2) / colSums(w)
    # Stop when the parameter updates fall below the threshold
    if((sum((p - oldp)^2) < 1e-4) &
       (sum((mu - oldmu)^2) < 1e-4) &
       (sum((sig - oldsig)^2) < 1e-4)) break
  }
  return(list(p = p, mu = mu, sig = sig))
}
```
There are, of course, existing packaged functions that implement the EM algorithm (a packaged cross-check is sketched after the example below), but here we review the algorithm through the function $gmm()$. In the E-step we compute, for each observation, the posterior probability of having been generated by each normal component. In the M-step we update the parameters so that the log-likelihood does not decrease. The old parameter values are replaced by the new ones until the changes between successive iterations fall below the chosen threshold.
An example is shown below:
```r
n <- 200
u <- runif(n)
x <- ifelse(u < .7, rnorm(n), rnorm(n, 3, 9))
gmm(c(0.65, 0.35), c(-0.1, 3.21), c(1.59, 7.99), x)
```
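As noted above, packaged EM implementations already exist. A minimal hedged cross-check (not part of SC19077, and assuming the `mixtools` package is installed) fits the same mixture with `mixtools::normalmixEM()`:

```r
# Hedged cross-check using an existing EM implementation; the starting values
# below mirror the gmm() example and are assumptions, not package defaults.
library(mixtools)
set.seed(1)
n <- 200
u <- runif(n)
x <- ifelse(u < .7, rnorm(n), rnorm(n, 3, 9))
fit <- normalmixEM(x, lambda = c(0.65, 0.35),
                   mu = c(-0.1, 3.21), sigma = c(1.59, 7.99))
fit$lambda   # estimated mixing proportions
fit$mu       # estimated component means
fit$sigma    # estimated component standard deviations
```

The results should be broadly comparable to the output of $gmm()$ above, up to the labeling of the two components.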
Q: Why do you take the statistical computing class?
A: I hope to excel at programming with R and contribute my own packages to the Internet, although my programming skills are still somewhat weak.
Q: What do you think of statistical computing?
A: Statistical computing is interesting! The tools the teacher presented in class are so advanced and convenient that I cannot wait to use them!
Q: How will you improve your programming skills?
A: I plan to practice programming every day and read code online.
```r
library(knitr)
num  <- 1:5
Mon  <- c(' ', ' ', 'English', ' ', 'Politics')
Tues <- c(' ', ' ', 'Probability', ' ', ' ')
Wed  <- c('Linear Model', ' ', ' ', ' ', ' ')
Thur <- c('Probability', ' ', ' ', ' ', ' ')
Fri  <- c(' ', 'Computing', 'Linear Model', ' ', ' ')
schedule <- data.frame(num, Mon, Tues, Wed, Thur, Fri)
kable(schedule)
```
```r
library(MASS)
attach(iris)
train <- sample.int(150, 100)
fit <- lda(Species ~ ., iris[train, ])
plot(fit, main = "LDA for iris Dataset")
```
```r
sigma <- 1:4
f1 <- function(x) x / (sigma[1]^2) * exp(-x^2 / (2 * sigma[1]^2))
f2 <- function(x) x / (sigma[2]^2) * exp(-x^2 / (2 * sigma[2]^2))
f3 <- function(x) x / (sigma[3]^2) * exp(-x^2 / (2 * sigma[3]^2))
f4 <- function(x) x / (sigma[4]^2) * exp(-x^2 / (2 * sigma[4]^2))
t <- seq(0, 10, 0.01)
curve(f1, 0, 10)
curve(f2, 0, 10)
curve(f3, 0, 10)
curve(f4, 0, 10)
```
```r
set.seed(1)
n <- 1e2
rn <- matrix(rep(0, n * 4), ncol = 4)
i <- c(0, 0, 0, 0)
j <- c(0, 0, 0, 0)
for(k in 1:4){
  while(i[k] < n){
    u <- runif(1)
    j[k] <- j[k] + 1
    x <- rgamma(1, shape = 2, scale = 2 * sigma[k]^2)
    if(u <= exp((x - x^2) / (2 * sigma[k]^2))){
      i[k] <- i[k] + 1
      rn[i[k], k] <- x
    }
  }
}
j
i
```
```r
hist(rn[, 1], prob = TRUE, xlab = 'x', ylab = 'f(x)',
     main = expression(f(x) == x / sigma[1]^2 * e^(-x^2 / 2 * sigma[1]^2)))
lines(t, f1(t))
hist(rn[, 2], prob = TRUE, xlab = 'x', ylab = 'f(x)',
     main = expression(f(x) == x / sigma[2]^2 * e^(-x^2 / 2 * sigma[2]^2)))
lines(t, f2(t))
hist(rn[, 3], prob = TRUE, xlab = 'x', ylab = 'f(x)',
     main = expression(f(x) == x / sigma[3]^2 * e^(-x^2 / 2 * sigma[3]^2)))
lines(t, f3(t))
hist(rn[, 4], prob = TRUE, xlab = 'x', ylab = 'f(x)',
     main = expression(f(x) == x / sigma[4]^2 * e^(-x^2 / 2 * sigma[4]^2)))
lines(t, f4(t))
```
The histograms above show that the modes of the four groups of generated samples are all close to the corresponding theoretical modes.
Generate a random sample of size 1000 from a normal location mixture. The components of the mixture have N(0, 1) and N(3, 1) distributions with mixing probabilities p1 and p2 = 1 − p1. Graph the histogram of the sample with density superimposed, for p1 = 0.75. Repeat with different values for p1 and observe whether the empirical distribution of the mixture appears to be bimodal. Make a conjecture about the values of p1 that produce bimodal mixtures.
First, generate a random sample from the specified normal location mixture with $p_1=0.75$. Then graph the histogram of the sample.
```r
set.seed(2)
n <- 1e3
u <- runif(n)
rn <- ifelse(u < .75, rnorm(n), rnorm(n, 3, 1))
t <- seq(-6, 6, .01)
hist(rn, prob = TRUE, xlim = c(-6, 6), ylim = c(0, 0.4),
     main = 'Normal Location Mixture of N(0,1) and N(3,1) for p1 = 0.75')
lines(t, 0.75 * dnorm(t, 0, 1) + 0.25 * dnorm(t, 3, 1))
```
Graph the histograms of samples for $19$ different values of $p_1$ ranging from $0.05$ to $0.95$.
```r
p1 <- seq(.05, .95, .05)
mix <- matrix(rep(0, n * 19), ncol = 19)
for(i in 1:19){
  u <- runif(n)
  mix[, i] <- ifelse(u < p1[i], rnorm(n), rnorm(n, 3, 1))
}
x <- seq(-4, 8, .01)
for(i in 1:19){
  p <- 0.05 * i
  hist(mix[, i], prob = TRUE, xlim = c(-4, 8), ylim = c(0, 0.5), main = '')
  legend(5, 0.5, title = "p1", p, xjust = 0, box.lty = 0)
  lines(x, p1[i] * dnorm(x, 0, 1) + (1 - p1[i]) * dnorm(x, 3, 1))
}
```
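The exercise also asks for a conjecture about which values of $p_1$ produce a bimodal mixture. A minimal numeric sketch (my own rough grid check on the theoretical density, not part of the original answer) is to count the local maxima of the mixture density for each $p_1$:

```r
# Count interior local maxima of the theoretical mixture density on a fine grid.
# This is only a rough check, not a proof of bimodality.
p1 <- seq(.05, .95, .05)
x  <- seq(-4, 8, .01)
n_modes <- sapply(p1, function(p) {
  d <- p * dnorm(x, 0, 1) + (1 - p) * dnorm(x, 3, 1)
  sum(diff(sign(diff(d))) == -2)   # a +/- sign change of the slope marks a local maximum
})
rbind(p1, n_modes)
```

Values of $p_1$ for which `n_modes` equals 2 are the ones where the theoretical mixture is bimodal, which can be compared with the visual impression from the histograms above.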
```r
rmvn <- function(d, n, Sigma){
  # Bartlett decomposition: T is lower triangular with chi-squared
  # diagonal entries and standard normal entries below the diagonal
  T <- matrix(rep(0, d * d), nrow = d)
  for(i in 1:d){
    x <- rchisq(1, n - i + 1)
    T[i, i] <- sqrt(x)
  }
  for(i in 2:d){
    for(j in 1:(i - 1)){
      T[i, j] <- rnorm(1)
    }
  }
  A <- T %*% t(T)
  # Square root of Sigma via its spectral decomposition
  ev <- eigen(Sigma)
  lambda <- ev$values
  v <- ev$vectors
  L <- v %*% diag(sqrt(lambda)) %*% t(v)
  # One draw from the Wishart distribution W_d(Sigma, n)
  L %*% A %*% t(L)
}
```
Let's run a simulation.
```r
n <- 5
d <- 3
B <- matrix(c(1, 1, 1, 1, 2, 4, 1, 3, 9), nrow = 3)
Sigma <- (B / 10) %*% t(B / 10)
Sigma
r <- rmvn(d, n, Sigma)
r
```
The random sample $r$ above is generated from $W_3(\Sigma, 5)$, where
$$
\Sigma =
\begin{bmatrix}
0.03 & 0.06 & 0.14\\
0.06 & 0.14 & 0.36\\
0.14 & 0.36 & 0.98
\end{bmatrix}.
$$
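As a hedged sanity check (not part of the original answer), base R's `stats::rWishart()` generates Wishart matrices directly; since $E[W] = n\Sigma$, the average of many draws should be close to $5\Sigma$ under either generator:

```r
# Hedged cross-check with the built-in Wishart generator in 'stats'.
set.seed(123)
B <- matrix(c(1, 1, 1, 1, 2, 4, 1, 3, 9), nrow = 3)
Sigma <- (B / 10) %*% t(B / 10)
m <- 1e3
draws <- rWishart(m, df = 5, Sigma = Sigma)   # a 3 x 3 x m array of Wishart draws
round(apply(draws, c(1, 2), mean), 3)         # empirical mean of the draws
round(5 * Sigma, 3)                           # theoretical mean E[W] = 5 * Sigma
```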
```r
real <- cos(0) - cos(pi/3)   # exact value of the integral
real
set.seed(1)
n <- 1e4
x <- runif(n, 0, pi/3)
theta <- round(mean(pi/3 * sin(x)), 5)   # simple Monte Carlo estimate
theta
```
```r
set.seed(4)
g <- function(x) pi/3 * sin(x)
f <- function(x) pi/3 * x
n <- 1e4
r <- 1e3
u <- runif(r, 0, pi/3)
c <- -cov(f(u), g(u)) / var(f(u))
c
MC1 <- MC2 <- numeric(n)
for (i in 1:n){
  v <- runif(r, 0, pi/3)
  MC1[i] <- mean(g(v))
  MC2[i] <- mean(g(v) + c * (f(v) - pi/3 * pi/6))
}
theta1 <- mean(MC1)
theta2 <- mean(MC2)
theta1
theta2
(var(MC1) - var(MC2)) / var(MC1)
```
```r
set.seed(2)
g <- function(x) exp(-x) / (1 + x^2)
n <- 1e3
r <- 1e4
MC1 <- MC2 <- numeric(n)
for (i in 1:n){
  u <- runif(r/2)
  v <- runif(r/2)
  MC1[i] <- round(mean(g(c(u, v))), 5)
  MC2[i] <- round(mean(g(c(u, 1 - u))), 5)
}
alpha1 <- round(mean(MC1), 5)
alpha2 <- round(mean(MC2), 5)
alpha1
alpha2
(var(MC1) - var(MC2)) / var(MC1)
```
```r
set.seed(3)
k <- 5
Finvers <- function(x) -log(1 - (1 - exp(-1)) * x)
a <- c(rep(0, 5), 1)
for (i in 2:5){
  a[i] <- Finvers((i - 1) / 5)
}
a
```
We get 5 subintervals: $[0, 0.135)$, $[0.135, 0.291)$, $[0.291, 0.477)$, $[0.477, 0.705)$, $[0.705, 1]$.
```r
n <- 100
r <- 100
g <- function(x) exp(-x) / (1 + x^2) * (x > 0) * (x < 1)
f <- function(x) exp(-x) / (1 - exp(-1))
p <- function(x) k * f(x)
Ginvers <- function(a, u) -log(exp(-a) - (1 - exp(-1)) / 5 * u)
theta <- numeric(n)
b <- numeric(k)
for (i in 1:n){
  for (j in 1:k){
    u <- runif(r/k, a[j], a[j+1])
    x <- Ginvers(a[j], u)
    b[j] <- mean(g(x) / p(x) * (x > a[j]) * (x < a[j+1]))
  }
  theta[i] <- sum(b)
}
round(mean(theta), 5)
se <- 0.0970314
(se^2 - var(theta)) / se^2
```
```r
set.seed(1)
m <- 1e4
n <- 20
alpha <- .05
UCL <- LCL <- numeric(m)
for (i in 1:m){
  x <- rchisq(n, 2)
  # t-interval for the mean: mean(x) +/- t_{alpha/2, n-1} * s / sqrt(n)
  UCL[i] <- mean(x) + sd(x) / sqrt(n) * qt(alpha/2, df = n - 1, lower.tail = FALSE)
  LCL[i] <- mean(x) - sd(x) / sqrt(n) * qt(alpha/2, df = n - 1, lower.tail = FALSE)
}
r <- mean(LCL < 2 & 2 < UCL)
r
abs(r - (1 - alpha)) - abs(0.956 - (1 - alpha))
se <- sqrt(r * (1 - r) / m)
se
(0.00689 - se) / 0.00689
```
```r
set.seed(2)
m <- 100
a <- 1e3
n <- 50
q <- c(.025, .05, .95, .975)
xq <- qnorm(q, 0, sqrt(6/n))
sk <- function(x){
  xbar <- mean(x)
  m3 <- mean((x - xbar)^3)
  m2 <- mean((x - xbar)^2)
  return(m3 / m2^1.5)
}
b <- numeric(a)
for(j in 1:a){
  x <- rnorm(n)
  b[j] <- sk(x)
}
M <- quantile(b, q)
for (i in 1:(m - 1)){
  for(j in 1:a){
    x <- rnorm(n)
    b[j] <- sk(x)
  }
  r <- quantile(b, q)
  M <- rbind(M, r)
}
xq.est <- apply(M, 2, mean)
se.xq.est <- apply(M, 2, sd)
t <- dnorm(xq, 0, sqrt(6 * (n - 2) / ((n + 1) * (n + 3))))
se <- sqrt(q * (1 - q) / n / t^2)
knitr::kable(rbind(xq, xq.est), format = "html",
             col.names = c("0.025", "0.05", "0.95", "0.975"))
knitr::kable(rbind(se, se.xq.est), format = "html",
             col.names = c("0.025", "0.05", "0.95", "0.975"))
```
```r
set.seed(1)
alpha <- .05
a <- 1
b <- 10
dis <- seq(0, 1, .05)
m <- 1e4
n <- 30
cv <- qnorm(1 - alpha/2, 0, sqrt(6 * (n - 2) / (n + 1) / (n + 3)))
pwr1 <- numeric(length(dis))
sk <- function(x){
  xbar <- mean(x)
  return(mean((x - xbar)^3) / (mean((x - xbar)^2))^1.5)
}
for(i in 1:length(dis)){
  d <- dis[i]
  sktest <- numeric(m)
  for(j in 1:m){
    u <- runif(n)
    x <- ifelse(u < d, rbeta(n, a, a), rbeta(n, b, b))
    sktest[j] <- as.integer(abs(sk(x)) >= cv)
  }
  pwr1[i] <- mean(sktest)
}
plot(dis, pwr1, type = "b", xlab = "d", ylab = "power",
     main = "Power of skewness test against Beta(a,a)")
abline(h = alpha, lty = 3)
```
In order to estimate the power of the skewness test against symmetric Beta alternatives, we draw samples from $H_1$ with the mixing proportion $d$ ranging from 0 to 1.
The figure above shows that the power first rises and then falls as $d$ increases, reaching its peak when $d$ is around 0.1. Moreover, the power is larger than $\alpha$ when $d < 0.4$ and smaller than $\alpha$ when $d > 0.4$. In general, the power of the skewness test of normality against this symmetric Beta alternative never exceeds about 0.4, which means the skewness test of normality against $Beta(\alpha,\alpha)$ is not very powerful.
(2) The skewness test of normality with sampling distribution $t(v)$
We sample from $H_1$ by choosing a contaminated t distribution $d t(v_1) + (1-d)t(v_2)$ where $v_1\neq v_2$.
```r
set.seed(2)
alpha <- .05
t1 <- 2
t2 <- 9                 # degrees of freedom of the two t distributions
dis <- seq(0, 1, .05)   # mixing proportions used to sample from H1
m <- 1e4
n <- 30
cv <- qnorm(1 - alpha/2, 0, sqrt(6 * (n - 2) / (n + 1) / (n + 3)))
pwr2 <- numeric(length(dis))
sk <- function(x){
  xbar <- mean(x)
  return(mean((x - xbar)^3) / (mean((x - xbar)^2))^1.5)
}
for(i in 1:length(dis)){
  d <- dis[i]
  sktest <- numeric(m)
  for(j in 1:m){
    u <- runif(n)
    x <- ifelse(u < d, rt(n, t1), rt(n, t2))
    sktest[j] <- as.integer(abs(sk(x)) >= cv)
  }
  pwr2[i] <- mean(sktest)
}
plot(dis, pwr2, type = "b", xlab = "d", ylab = "power",
     main = "Power of skewness test against t(v)")
abline(h = alpha, lty = 3)
```
From the figure above we can see that the empirical power of the skewness test against the t alternative increases with $d$ and is almost everywhere larger than the power against the symmetric Beta alternative. This suggests that the contaminated t alternative is more useful than the symmetric Beta alternative for studying the power of the skewness test of normality.
```r
set.seed(3)
alpha <- .05
mu0 <- 1
n <- seq(20, 500, 30)
m <- 100
p.hat <- numeric(length(n))
for(i in 1:length(n)){
  N <- n[i]
  ttest <- numeric(m)
  for(j in 1:m){
    x <- rchisq(N, 1)
    ttest[j] <- t.test(x, mu = mu0)$p.value
  }
  p.hat[i] <- mean(ttest <= alpha)
}
se <- sqrt(p.hat * (1 - p.hat) / m)
plot(n, p.hat, ylab = "tIe", type = "b", pch = 20)
abline(h = alpha, lty = 3)
lines(n, p.hat + se, lty = 3)
lines(n, p.hat - se, lty = 3)
```
The figure shows that it is better to choose a large $n$ for the t-test against $\chi^2(1)$ in order to get an empirical Type I error close to $\alpha$. However, the empirical Type I error of the t-test against $\chi^2(1)$ is almost always larger than $\alpha$.
(2) t-test against $U(0,2)$
```r
set.seed(4)
alpha <- .05
mu0 <- 1
n <- seq(20, 500, 30)
m <- 100
p.hat <- numeric(length(n))
for(i in 1:length(n)){
  N <- n[i]
  ttest <- numeric(m)
  for(j in 1:m){
    x <- runif(N, 0, 2)
    ttest[j] <- t.test(x, mu = mu0)$p.value
  }
  p.hat[i] <- mean(ttest <= alpha)
}
se <- sqrt(p.hat * (1 - p.hat) / m)
plot(n, p.hat, ylab = "tIe", type = "b", pch = 20)
abline(h = alpha, lty = 3)
lines(n, p.hat + se, lty = 3)
lines(n, p.hat - se, lty = 3)
```
```r
set.seed(6)
alpha <- .05
mu0 <- 1
n <- seq(20, 500, 30)
m <- 100
p.hat <- numeric(length(n))
for(i in 1:length(n)){
  N <- n[i]
  ttest <- numeric(m)
  for(j in 1:m){
    x <- rexp(N, 1)
    ttest[j] <- t.test(x, mu = mu0)$p.value
  }
  p.hat[i] <- mean(ttest <= alpha)
}
se <- sqrt(p.hat * (1 - p.hat) / m)
plot(n, p.hat, ylab = "tIe", type = "b", pch = 20)
abline(h = alpha, lty = 3)
lines(n, p.hat + se, lty = 3)
lines(n, p.hat - se, lty = 3)
```
The figure shows that it is better to choose $n > 100$ for the t-test against $Exp(1)$ in order to get an empirical Type I error close to $\alpha$. Compared with (2), the empirical Type I error in (3) is sometimes slightly smaller than $\alpha$.
If we obtain the powers of two methods under a particular simulation setting with 10000 experiments, say 0.651 for one method and 0.676 for the other, can we say the powers are different at the 0.05 level?
```r
set.seed(7)
n <- 30
m <- 100
p1 <- 0.651
p2 <- 0.676
x <- rbinom(n, size = m, prob = p1)
y <- rbinom(n, size = m, prob = p2)
ttest <- t.test(x, y, conf.level = 1 - alpha)
p <- ttest$p.value
p
```
From the result of the t-test we reject $H_0$, i.e. we cannot say the two powers are the same at the 0.05 level.
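A more direct approach (a hedged alternative, not the author's method) treats each power as a proportion of successes out of the 10000 experiments and applies a two-proportion test with base R's `prop.test()`:

```r
# Hedged alternative: compare the two powers directly as proportions of
# successes out of 10000 experiments each (values taken from the question).
prop.test(x = c(0.651 * 10000, 0.676 * 10000), n = c(10000, 10000))
# A p-value below 0.05 would suggest the two powers differ at the 0.05 level.
```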
---
title: "A-19077-2019-11-08"
author: "By-19077"
date: "2019/11/10"
output: html_document
---
knitr::opts_chunk$set(echo = TRUE)
Efron and Tibshirani discuss the scor (bootstrap) test score data on 88 students who took examinations in five subjects [84, Table 7.1], [188, Table 1.2.1]. The first two tests (mechanics, vectors) were closed book and the last three tests (algebra, analysis, statistics) were open book. Each row of the data frame is a set of scores $(x_{i1}, \ldots, x_{i5})$ for the $i$th student. Use a panel display to display the scatter plots for each pair of test scores. Compare the plot with the sample correlation matrix. Obtain bootstrap estimates of the standard errors for each of the following estimates: $\hat\rho_{12} = \hat\rho(\text{mec}, \text{vec})$, $\hat\rho_{34} = \hat\rho(\text{alg}, \text{ana})$, $\hat\rho_{35} = \hat\rho(\text{alg}, \text{sta})$, $\hat\rho_{45} = \hat\rho(\text{ana}, \text{sta})$.
1) panel display
```r
library(bootstrap)
data(scor)
attach(scor)
pairs(scor, labels = c("mec", "vec", "alg", "ana", "sta"))
round(cor(scor), 4)
```
```r
set.seed(1)
library(boot)
n <- nrow(scor)
r <- 100
f12 <- function(x, i) cor(x[i, 1], x[i, 2])
f34 <- function(x, i) cor(x[i, 3], x[i, 4])
f35 <- function(x, i) cor(x[i, 3], x[i, 5])
f45 <- function(x, i) cor(x[i, 4], x[i, 5])
cor12 <- boot(data = scor, statistic = f12, R = r)
cor34 <- boot(data = scor, statistic = f34, R = r)
cor35 <- boot(data = scor, statistic = f35, R = r)
cor45 <- boot(data = scor, statistic = f45, R = r)
boot.est <- cbind(cor12[1], cor34[1], cor35[1], cor45[1])
boot.est <- round(as.numeric(boot.est), 3)
y <- cbind(cor12$t, cor34$t, cor35$t, cor45$t)
se.hat <- round(apply(y, 2, sd), 4)
f <- data.frame(boot.est, se.hat, row.names = c("rho12", "rho34", "rho35", "rho45"))
knitr::kable(f)
```
```r
set.seed(2)
n <- 20
m <- 100
sk1 <- 0
library(boot)
sk <- function(x) {
  xbar <- mean(x)
  m3 <- mean((x - xbar)^3)
  m2 <- mean((x - xbar)^2)
  return(m3 / m2^1.5)
}
b <- function(x, i) sk(x[i])
n.ci.norm <- n.ci.basic <- n.ci.perc <- matrix(NA, m, 2)
n.cover <- n.misleft <- n.misright <- numeric(3)
for (i in 1:m){
  x <- rnorm(n)
  n.boot.est <- boot(data = x, statistic = b, R = m)
  n.boot.ci <- boot.ci(n.boot.est, type = c("norm", "basic", "perc"))
  n.ci.norm[i, ] <- n.boot.ci$normal[2:3]
  n.ci.basic[i, ] <- n.boot.ci$basic[4:5]
  n.ci.perc[i, ] <- n.boot.ci$percent[4:5]
}
n.norm.indic <- n.basic.indic <- n.perc.indic <- numeric(m)
n.norm.left <- n.norm.right <- numeric(m)
n.basic.left <- n.basic.right <- numeric(m)
n.perc.left <- n.perc.right <- numeric(m)
for (i in 1:m){
  n.norm.indic[i] <- as.integer(n.ci.norm[i, 1] <= 0 & 0 <= n.ci.norm[i, 2])
  n.basic.indic[i] <- as.integer(n.ci.basic[i, 1] <= 0 & 0 <= n.ci.basic[i, 2])
  n.perc.indic[i] <- as.integer(n.ci.perc[i, 1] <= 0 & 0 <= n.ci.perc[i, 2])
  n.norm.right[i] <- as.integer(n.ci.norm[i, 2] < 0)
  n.basic.right[i] <- as.integer(n.ci.basic[i, 2] < 0)
  n.perc.right[i] <- as.integer(n.ci.perc[i, 2] < 0)
  n.norm.left[i] <- as.integer(0 < n.ci.norm[i, 1])
  n.basic.left[i] <- as.integer(0 < n.ci.basic[i, 1])
  n.perc.left[i] <- as.integer(0 < n.ci.perc[i, 1])
}
n.cover <- c(mean(n.norm.indic), mean(n.basic.indic), mean(n.perc.indic))
n.misleft <- c(mean(n.norm.left), mean(n.basic.left), mean(n.perc.left))
n.misright <- c(mean(n.norm.right), mean(n.basic.right), mean(n.perc.right))
n.cover
n.misright
n.misleft
```
```r
set.seed(3)
n <- 20
m <- 100
f <- function(x) dchisq(x, 5) * (x - 5)^3 / sqrt(10)^3
sk2 <- round(integrate(f, 0, Inf)$value, 3)
sk2
ch.ci.norm <- ch.ci.basic <- ch.ci.perc <- matrix(NA, m, 2)
ch.cover <- ch.misleft <- ch.misright <- numeric(3)
for (i in 1:m){
  x <- rchisq(n, 5)
  ch.boot.est <- boot(data = x, statistic = b, R = m)
  ch.boot.ci <- boot.ci(ch.boot.est, type = c("norm", "basic", "perc"))
  ch.ci.norm[i, ] <- ch.boot.ci$normal[2:3]
  ch.ci.basic[i, ] <- ch.boot.ci$basic[4:5]
  ch.ci.perc[i, ] <- ch.boot.ci$percent[4:5]
}
ch.norm.indic <- ch.basic.indic <- ch.perc.indic <- numeric(m)
ch.norm.left <- ch.norm.right <- numeric(m)
ch.basic.left <- ch.basic.right <- numeric(m)
ch.perc.left <- ch.perc.right <- numeric(m)
for (i in 1:m){
  ch.norm.indic[i] <- as.integer(ch.ci.norm[i, 1] <= sk2 & sk2 <= ch.ci.norm[i, 2])
  ch.basic.indic[i] <- as.integer(ch.ci.basic[i, 1] <= sk2 & sk2 <= ch.ci.basic[i, 2])
  ch.perc.indic[i] <- as.integer(ch.ci.perc[i, 1] <= sk2 & sk2 <= ch.ci.perc[i, 2])
  ch.norm.right[i] <- as.integer(ch.ci.norm[i, 2] < sk2)
  ch.basic.right[i] <- as.integer(ch.ci.basic[i, 2] < sk2)
  ch.perc.right[i] <- as.integer(ch.ci.perc[i, 2] < sk2)
  ch.norm.left[i] <- as.integer(sk2 < ch.ci.norm[i, 1])
  ch.basic.left[i] <- as.integer(sk2 < ch.ci.basic[i, 1])
  ch.perc.left[i] <- as.integer(sk2 < ch.ci.perc[i, 1])
}
ch.cover <- c(mean(ch.norm.indic), mean(ch.basic.indic), mean(ch.perc.indic))
ch.misleft <- c(mean(ch.norm.left), mean(ch.basic.left), mean(ch.perc.left))
ch.misright <- c(mean(ch.norm.right), mean(ch.basic.right), mean(ch.perc.right))
ch.cover     # 0.719 0.702 0.71
ch.misright  # 0.26 0.264 0.29
ch.misleft   # 0.021 0.034 0
```
From the above, the coverage rates for the normal and $\chi^2(5)$ distributions are:
```r
f <- data.frame(n.cover, ch.cover, row.names = c("Standard", "Basic", "Percentile"))
knitr::kable(f)
```
The rates of missing on the right and on the left for the normal distribution are:
```r
f <- data.frame(n.misright, n.misleft, row.names = c("Standard", "Basic", "Percentile"))
knitr::kable(f)
```
The rates of missing on the right and on the left for the $\chi^2(5)$ distribution are:
```r
f <- data.frame(ch.misright, ch.misleft, row.names = c("Standard", "Basic", "Percentile"))
knitr::kable(f)
```
```r
library(bootstrap)
n <- nrow(scor)
cov1 <- (n - 1) / n * cov(scor)
value1 <- unlist(eigen(cov1, symmetric = TRUE, only.values = TRUE))
theta.hat <- max(value1) / sum(value1)
theta.jack <- numeric(n)
for (i in 1:n){
  cov2 <- (n - 1) / n * cov(scor[-i, ])
  value2 <- unlist(eigen(cov2, symmetric = TRUE, only.values = TRUE))
  theta.jack[i] <- max(value2) / sum(value2)
}
bias.jack <- (n - 1) * (mean(theta.jack) - theta.hat)
se.jack <- sqrt((n - 1) * mean((theta.jack - mean(theta.jack))^2))
bias.jack <- round(bias.jack, 5)
se.jack <- round(se.jack, 4)
print(c(bias = bias.jack, se = se.jack))
```
In Example 7.18, leave-one-out (n-fold) cross validation was used to select the best fitting model. Repeat the analysis replacing the Log-Log model with a cubic polynomial model. Which of the four models is selected by the cross validation procedure? Which model is selected according to maximum adjusted R2?
```r
library(DAAG)
attach(ironslag)
modl <- c("Linear", "Quadratic", "Exponential", "CubicPolynomial")
which.best <- function(dat){
  n <- length(dat[, 1])
  y1 <- y2 <- y3 <- y4 <- numeric(n)
  for (i in 1:n){
    x <- dat[-i, 1]
    y <- dat[-i, 2]
    x0 <- dat[i, 1]
    cv1 <- lm(y ~ x)
    a1 <- cv1$coef
    y1[i] <- a1[1] + a1[2] * x0
    cv2 <- lm(y ~ x + I(x^2))
    a2 <- cv2$coef
    y2[i] <- a2[1] + a2[2] * x0 + a2[3] * x0^2
    cv3 <- lm(log(y) ~ x)
    a3 <- cv3$coef
    y3[i] <- exp(a3[1] + a3[2] * x0)
    # raw = TRUE so the coefficients can be applied directly to powers of x0
    cv4 <- lm(y ~ poly(x, degree = 3, raw = TRUE))
    a4 <- cv4$coef
    y4[i] <- a4[1] + a4[2] * x0 + a4[3] * x0^2 + a4[4] * x0^3
  }
  y <- cbind(y1, y2, y3, y4)
  Y <- dat[, 2]
  e <- numeric(4)
  for (i in 1:4) e[i] <- mean((Y - y[, i])^2)
  SST <- sum((Y - mean(Y))^2)
  SSR <- numeric(4)
  for (i in 1:4) SSR[i] <- sum((Y - y[, i])^2)
  k <- c(1, 2, 1, 3)
  R2 <- 1 - SSR / SST
  adjR2 <- 1 - (1 - R2) * (n - 1) / (n - k - 1)
  num <- c(which.min(e), which.max(adjR2))
  return(c(CrossValidation = modl[num[1]], adjustedR2 = modl[num[2]]))
}
which.best(ironslag)
```
The empirical Type I error rate is 0.044, which suggests that a critical value of 6 is better than 5 in this setting with unequal sample sizes.
---
title: "A-19077-2019-11-29"
author: "By 19077"
date: "2019/11/29"
output: html_document
---
Implement a random walk Metropolis sampler for generating the standard Laplace distribution $f(x)={1\over2}e^{-|x|}$(see Exercise 3.2). For the increment, simulate from a normal distribution. Compare the chains generated when different variances are used for the proposal distribution. Also, compute the acceptance rates of each chain.
Implement the random walk version of the Metropolis sampler to generate the target standard Laplace distribution, using the proposal distribution $N(X_t, \sigma^2)$.
```r
library(GeneralizedHyperbolic)
rw.Metropolis <- function(x0, N, sigma){
  x <- numeric(N)
  x[1] <- x0
  u <- runif(N)
  k <- 0
  for (i in 2:N){
    # propose a candidate from N(x[i-1], sigma^2)
    y <- rnorm(1, x[i-1], sigma)
    # accept with probability f(y)/f(x[i-1]) under the standard Laplace density
    if (u[i] <= dskewlap(y) / dskewlap(x[i-1])) x[i] <- y
    else{
      x[i] <- x[i-1]
      k <- k + 1   # count rejections
    }
  }
  return(list(x = x, k = k))
}
sigma <- c(.05, .1, seq(.2, 1, .4), seq(2, 6, 1), 10)
n <- length(sigma)
N <- 2000
x0 <- rnorm(1)
rw <- matrix(nrow = N, ncol = n)
Num_of_Reject <- numeric(n)
for (i in 1:n){
  rw[, i] <- rw.Metropolis(x0, N, sigma[i])$x
  Num_of_Reject[i] <- rw.Metropolis(x0, N, sigma[i])$k
}
```
The acceptance rates are as follows:
```r
Accept_Rate <- round(1 - Num_of_Reject / N, 3)
knitr::kable(rbind(Num_of_Reject, Accept_Rate), format = "html",
             col.names = c("0.05", "0.1", "0.2", "0.6", "1", "2", "3", "4", "5", "6", "10"))
refline <- qskewlap(c(.025, .975))
for (i in 1:n){
  plot(1:N, rw[, i], type = "l", xlab = bquote(sigma == .(sigma[i])), ylab = "X")
  abline(h = refline)
}
```
From the figures above we can see:
a) the chains did not converge when $\sigma < 0.6$;
b) when $\sigma > 2$ there are many horizontal segments in the plots, where the candidate point was rejected and the chain did not move at those time points.
In conclusion, to implement a random walk Metropolis sampler for the standard Laplace distribution with a normal proposal distribution, it is better to choose a standard deviation in the range $0.6 \leq \sigma \leq 2$.
---
title: "A-19077-2019-12-06"
author: "By 19077"
date: "2019/12/10"
output:
  html_document: default
---
The natural logarithm and exponential functions are inverses of each other, so that mathematically log(exp x) = exp(log x) = x. Show by example that this property does not hold exactly in computer arithmetic. Does the identity hold with near equality? (See all.equal.)
```r
isTRUE(exp(log(.001)) == .001)
isTRUE(log(exp(.001)) == .001)
isTRUE(log(exp(.001)) == exp(log(.001)))
isTRUE(all.equal(log(exp(.001)), .001))
isTRUE(all.equal(exp(log(.001)), .001))
isTRUE(all.equal(exp(log(.001)), log(exp(.001))))
```
In the example above I chose $x=0.001$. Mathematically both $e^{\ln(0.001)}$ and $\ln(e^{0.001})$ equal $0.001$, but this need not hold exactly in computer arithmetic, which is why the exact comparisons with "isTRUE" returned "FALSE". With near equality, however, the identity holds and "isTRUE" combined with "all.equal" returned "TRUE".
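To see how large the discrepancy actually is, one can print the raw differences at full precision (a small illustrative sketch; on some platforms the differences may be exactly zero):

```r
# Inspect the raw floating-point differences; they are at most a few ulps.
print(exp(log(.001)) - .001, digits = 17)
print(log(exp(.001)) - .001, digits = 17)
.Machine$double.eps        # machine epsilon, about 2.2e-16
sqrt(.Machine$double.eps)  # default tolerance of all.equal(), about 1.5e-8
```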
Write a function to solve the equation
$$\frac{2\Gamma\left(\frac{k}{2}\right)}{\sqrt{\pi(k-1)}\,\Gamma\left(\frac{k-1}{2}\right)}\int_{0}^{c_{k-1}}\left(1+\frac{u^{2}}{k-1}\right)^{-k/2}du =\frac{2\Gamma\left(\frac{k+1}{2}\right)}{\sqrt{\pi k}\,\Gamma\left(\frac{k}{2}\right)} \int_{0}^{c_{k}}\left(1+\frac{u^{2}}{k}\right)^{-(k+1)/2}du$$
for $a$, where $c_{k}=\sqrt{\frac{a^{2} k}{k+1-a^{2}}}$. Compare the solutions with the points $A(k)$ in Exercise 11.4.
```r
# 11.4
K <- c(4:25, 100, 500, 1e3)
UP <- sqrt(K)
n <- length(K)
A <- numeric(n)
for (i in 1:n){
  k <- K[i]
  f <- function(a){
    pt(sqrt(a^2 * (k - 1) / (k - a^2)), k - 1) - pt(sqrt(a^2 * k / (k + 1 - a^2)), k)
  }
  a <- UP[i] / 2 + 1
  out <- uniroot(f, c(0.5, a))
  A[i] <- out$root
}
```
```r
# 11.5
K1 <- c(4:25, 100)
n1 <- length(K1)
B <- numeric(n1)
ck <- function(k, a) sqrt((k * (a^2)) / (k + 1 - a^2))
integ_f1 <- function(u) (1 + (u^2) / (k - 1))^(-k/2)
integ_f2 <- function(u) (1 + (u^2) / k)^((-1 - k)/2)
f <- function(a){
  gamma(k/2) / (gamma((k-1)/2) * sqrt(k-1)) * integrate(integ_f1, 0, ck(k-1, a))$value -
    gamma((k+1)/2) / (gamma(k/2) * sqrt(k)) * integrate(integ_f2, 0, ck(k, a))$value
}
for (i in 1:n1){
  k <- K1[i]
  B[i] <- uniroot(f, lower = 0.05, upper = (sqrt(k)/2 + 1))$root
}
```
```r
A <- round(A[1:n1], 4)
B <- round(B, 4)
err <- A - B
f <- data.frame(rbind(A, B, err), row.names = c("11.4", "11.5", "error"))
knitr::kable(f)
```
Let the three alleles be A, B, and O with allele frequencies p, q, and r. The 6 genotype frequencies under HWE and complete counts are as follows.
|Genotype  |  AA  |  BB  |  OO  |  AO  |  BO  |  AB  | Sum |
|:--------:|:----:|:----:|:----:|:----:|:----:|:----:|:---:|
|Frequency |$p^2$ |$q^2$ |$r^2$ |$2pr$ |$2qr$ |$2pq$ |  1  |
|Count     |$n_{AA}$|$n_{BB}$|$n_{OO}$|$n_{AO}$|$n_{BO}$|$n_{AB}$| $n$ |
```r
library(rootSolve)
r <- 1e3
n1 <- 28   # observed A-type count
n2 <- 24   # observed B-type count
n3 <- 41   # observed OO count
n4 <- 70   # observed AB count
p0 <- .1
q0 <- .1
L <- c(.1, .1)
tol <- .Machine$double.eps^0.5
L.old <- L + 1
logML <- numeric(r)
for(j in 1:r){
  # expected complete-data log-likelihood evaluated at the current (p0, q0)
  logML[j] <- 2*p0*n1*log(p0)/(2-p0-2*q0) + 2*q0*n2*log(q0)/(2-q0-2*p0) +
    2*n3*log(1-p0-q0) +
    n1*(2-2*p0-2*q0)*log(2*p0*(1-p0-q0))/(2-p0-2*q0) +
    n2*(2-2*p0-2*q0)*log(2*q0*(1-p0-q0))/(2-q0-2*p0) +
    n4*log(2*p0*q0)
  # M-step: solve the score equations for the new (p, q)
  g <- function(x){
    F1 <- 2*p0*n1/((2-p0-2*q0)*x[1]) - 2*n3/(1-x[1]-x[2]) +
      n1*(2-2*p0-2*q0)*(1-2*x[1]-x[2])/((2-p0-2*q0)*x[1]*(1-x[1]-x[2])) -
      n2*(2-2*p0-2*q0)/((2-q0-2*p0)*(1-x[1]-x[2])) + n4/x[1]
    F2 <- 2*q0*n2/((2-q0-2*p0)*x[2]) - 2*n3/(1-x[1]-x[2]) -
      n1*(2-2*p0-2*q0)/((2-p0-2*q0)*(1-x[1]-x[2])) +
      n2*(2-2*p0-2*q0)*(1-2*x[2]-x[1])/((2-q0-2*p0)*x[2]*(1-x[1]-x[2])) + n4/x[2]
    c(F1 = F1, F2 = F2)
  }
  ss <- multiroot(f = g, start = c(.1, .1))
  L <- ss$root
  if (sum(abs(L - L.old) / L.old) < tol) break
  L.old <- L
  p0 <- L[1]   # update the E-step expectations with the current estimates
  q0 <- L[2]
}
L.old
plot(logML, type = "l")
```
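For comparison, here is a minimal sketch of the textbook EM iteration for the ABO allele frequencies (my own illustrative version with closed-form E- and M-steps rather than the numerical root-finding above; the observed counts are taken from the chunk above and the starting values are assumptions):

```r
# Minimal EM sketch for ABO allele frequencies (illustrative, not the author's method).
nA <- 28; nB <- 24; nOO <- 41; nAB <- 70   # observed phenotype counts
n <- nA + nB + nOO + nAB
p <- q <- 0.3                               # assumed starting values
for (iter in 1:100) {
  r <- 1 - p - q
  # E-step: expected genotype counts given the current (p, q, r)
  nAA <- nA * p^2 / (p^2 + 2 * p * r)
  nAO <- nA - nAA
  nBB <- nB * q^2 / (q^2 + 2 * q * r)
  nBO <- nB - nBB
  # M-step: allele counting
  p.old <- p; q.old <- q
  p <- (2 * nAA + nAO + nAB) / (2 * n)
  q <- (2 * nBB + nBO + nAB) / (2 * n)
  if (abs(p - p.old) + abs(q - q.old) < 1e-8) break
}
c(p = p, q = q, r = 1 - p - q)
```

The resulting $(p, q, r)$ can be compared with the root-finding estimates printed above; the corresponding log-likelihood values are non-decreasing over the EM iterations.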
---
title: "A-19077-2019-12-13"
author: "By 19077"
date: "2019/12/13"
output: html_document
---
knitr::opts_chunk$set(echo = TRUE)
Use both for loops and lapply() to fit linear models to mtcars using the formulas stored in this list:
```r
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
```
```r
attach(mtcars)
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)
# lapply
## version 1
lm_result1 <- lapply(formulas, function(x) lm(x))
## version 2
lm_result1 <- lapply(formulas, lm, data = mtcars)
# for loop
lm_result2 <- vector("list", length(formulas))
for (i in seq_along(formulas)) lm_result2[[i]] <- lm(formulas[[i]])
# coefficients
l_int  <- round(sapply(lm_result1, function(x) x$coefficients[1]), 3)
l_coef <- round(sapply(lm_result1, function(x) x$coefficients[2]), 3)
f_int  <- round(sapply(lm_result2, function(x) x$coefficients[1]), 3)
f_coef <- round(sapply(lm_result2, function(x) x$coefficients[2]), 3)
```
Fit the model mpg ~ disp to each of the bootstrap replicates of mtcars in the list below by using a for loop and lapply(). Can you do it without an anonymous function?
```r
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
```
```r
attach(mtcars)
bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = TRUE)
  mtcars[rows, ]
})
# for loop
results1 <- vector("list", length(bootstraps))
for(i in seq_along(bootstraps)){
  mt <- bootstraps[[i]]
  results1[[i]] <- lm(mt[, 1] ~ mt[, 3])
}
# lapply
results2 <- lapply(bootstraps, function(x) lm(x[, 1] ~ x[, 3]))
# coefficients
f_intercpt <- f_coef <- numeric(length(results1))
for(i in seq_along(bootstraps)){
  f_intercpt[i] <- results1[[i]]$coefficients[1]
  f_coef[i] <- results1[[i]]$coefficients[2]
}
round(f_intercpt, 3)
round(f_coef, 3)
```
```r
# without an anonymous function
results4 <- lapply(bootstraps, lm, formula = mpg ~ disp)
```
For each model in the previous two exercises, extract $R^2$ using the function below.
```r
rsq <- function(mod) summary(mod)$r.squared
```
```r
rsq <- function(mod) summary(mod)$r.squared
# T3
t3 <- sapply(lm_result1, rsq)
round(t3, 3)
```
```r
# T4
t4 <- sapply(results1, rsq)
round(t4, 3)
```
The following code simulates the performance of a t-test for non-normal data. Use sapply() and an anonymous function to extract the p-value from every trial.
```r
trials <- replicate(100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE)
```
Extra challenge: get rid of the anonymous function by using [[ directly.
```r
trials <- replicate(100, t.test(rpois(10, 10), rpois(7, 10)), simplify = FALSE)
p_result1 <- sapply(trials, function(x) x$p.value)
round(p_result1[1:10], 3)
```
```r
p_result2 <- sapply(trials, "[[", "p.value")
round(p_result2[1:10], 3)
```
Implement mcsapply(), a multicore version of sapply(). Can you implement mcvapply(), a parallel version of vapply()? Why or why not?
```r
library(parallel)
cores <- detectCores(logical = FALSE)
cl <- makeCluster(getOption("cl.cores", cores))
parSapply(cl, 1:500, get("+"), 3)[1:10]
stopCluster(cl)   # release the workers when done
```
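Building on the call above, a minimal mcsapply() could simply wrap parSapply() in a function that manages its own cluster (a sketch; the number of cores and the function name are my own choices, not part of the exercise text):

```r
# Minimal sketch of a multicore sapply() built on parallel::parSapply().
library(parallel)
mcsapply <- function(X, FUN, ..., cores = 2) {
  cl <- makeCluster(cores)
  on.exit(stopCluster(cl))      # always release the workers
  parSapply(cl, X, FUN, ...)
}
mcsapply(1:10, function(i) i^2)
```

Note that, as with parSapply(), any objects or packages the function relies on would have to be exported to the workers (e.g. with clusterExport()), since each worker runs in a fresh R session.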
As for whether we can implement a parallel version of vapply(), my answer is no. The condition for parallelisation is that each iteration is isolated from all the others (see p. 212, Advanced R); that is, if we scramble the order of the elements of X and iterate over them in sequence, the answer should always be the same.
To make this clear, consider the illustrative implementation of lapply() from Advanced R:
```r
lapply2 <- function(x, f, ...) {
  out <- vector("list", length(x))
  for (i in seq_along(x)) out[[i]] <- f(x[[i]], ...)
  out
}
```
When we scramble the order of computation, the illustrative implementation of lapply() becomes the following (note that the order of computation changes because of the function "sample"):
```r
lapply3 <- function(x, f, ...) {
  out <- vector("list", length(x))
  for (i in sample(seq_along(x))) out[[i]] <- f(x[[i]], ...)
  out
}
```
However, the result is the same as before, so lapply() has a parallel counterpart mclapply(), and the same holds for sapply().
But for vapply(), the example below shows that when the order of computation changes, the result of vapply() may change.
```r
x <- 1:10
y <- rep("a", 10)
df1 <- data.frame(x, y)
df2 <- data.frame(y, x)
vapply(df1, class, FUN.VALUE = character(1))
vapply(df2, class, FUN.VALUE = character(1))
```
The result changes, so vapply() cannot have a parallel counterpart mcvapply().
---
title: "A-19077-2019-12-20"
author: "By 19077"
date: "2019/12/20"
output: html_document
---
knitr::opts_chunk$set(echo = TRUE)
(1) You have already written an R function for Exercise 9.4 (page 277, Statistical Computing with R). Rewrite it as an Rcpp function for the same task.
9.4 Implement a random walk Metropolis sampler for generating the standard Laplace distribution (see Exercise 3.2). For the increment, simulate from a normal distribution. Compare the chains generated when different variances are used for the proposal distribution. Also, compute the acceptance rates of each chain.
## R
```r
library(GeneralizedHyperbolic)
rw.Metropolis <- function(x0, N, sigma){
  x <- numeric(N)
  x[1] <- x0
  u <- runif(N)
  k <- 0
  for (i in 2:N){
    y <- rnorm(1, x[i-1], sigma)
    if (u[i] <= dskewlap(y) / dskewlap(x[i-1])) x[i] <- y
    else{
      x[i] <- x[i-1]
      k <- k + 1
    }
  }
  return(list(x = x, k = k))
}
```
## C++
```r
library(Rcpp)
# generate random numbers
cppFunction('NumericVector rwMetropolisC(double x0, int N, double sigma){
  NumericVector xC(N);
  xC[0] = x0;
  NumericVector u = runif(N);
  int k = 0;
  for(int i = 1; i < N; i++) {
    double y = (rnorm(1, xC[i-1], sigma))[0];
    double res = exp(abs(xC[i-1]) - abs(y));
    if(u[i] < res){
      xC[i] = y;
    } else{
      xC[i] = xC[i-1];
      k++;
    }
  }
  return xC;
}')
# generate k (the number of rejections)
cppFunction('int rw_k_C(double x0, int N, double sigma){
  NumericVector xC(N);
  xC[0] = x0;
  NumericVector u = runif(N);
  int k = 0;
  for(int i = 1; i < N; i++) {
    double y = (rnorm(1, xC[i-1], sigma))[0];
    double res = exp(abs(xC[i-1]) - abs(y));
    if(u[i] < res){
      xC[i] = y;
    } else{
      xC[i] = xC[i-1];
      k++;
    }
  }
  return k;
}')
```
(2) Compare the random numbers generated by the two functions using a Q-Q plot.
```r
rw.Metropolis <- function(x0, N, sigma){
  x <- numeric(N)
  x[1] <- x0
  u <- runif(N)
  k <- 0
  for (i in 2:N){
    y <- rnorm(1, x[i-1], sigma)
    if (u[i] <= dskewlap(y) / dskewlap(x[i-1])) x[i] <- y
    else{
      x[i] <- x[i-1]
      k <- k + 1
    }
  }
  return(list(x = x, k = k))
}
sigma <- c(.1, .5, 2, 5, 10)
n <- length(sigma)
N <- 2000
x0 <- rnorm(1)
rwM <- matrix(nrow = N, ncol = n)
rwM_C <- matrix(nrow = N, ncol = n)
Num_of_Reject <- numeric(n)
Num_of_Reject_C <- numeric(n)
for (i in 1:n){
  rwM[, i] <- rw.Metropolis(x0, N, sigma[i])$x
  Num_of_Reject[i] <- rw.Metropolis(x0, N, sigma[i])$k
  rwM_C[, i] <- rwMetropolisC(x0, N, sigma[i])
  Num_of_Reject_C[i] <- rw_k_C(x0, N, sigma[i])
}
Accept_rate_R <- round(1 - Num_of_Reject / N, 3)
Accept_rate_Cpp <- round(1 - Num_of_Reject_C / N, 3)
# compare random numbers
qqplot(rwM, rwM_C, xlab = deparse(substitute(rwM)), ylab = deparse(substitute(rwM_C)),
       main = "rwMetropolis with R and Cpp", col = "blue")
x <- seq(-6, 6, .05)
lines(x, x, col = "red")
# compare acceptance rates
knitr::kable(rbind(Accept_rate_R, Accept_rate_Cpp), format = "html",
             col.names = c("0.1", "0.5", "2", "5", "10"))
```
(3) Compare the computation time of the two functions with microbenchmark.
```r
library(microbenchmark)
ts <- vector("list", n)
for (i in 1:n){
  times <- microbenchmark(rwM = rw.Metropolis(x0, N, sigma[i])$x,
                          rwM_C = rwMetropolisC(x0, N, sigma[i]))
  ts[[i]] <- summary(times)[, c(1, 3, 5, 6)]
}
ts
```
From the Q-Q plot we can see that the random numbers generated by the R and C++ implementations lie approximately along the line $y=x$, which indicates that the two samples come from the same distribution.
The computation times of the two implementations differ substantially: on average, the C++ function is about 66 times faster than the R function.