inst/doc/AllHomework.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## -----------------------------------------------------------------------------
library(knitr)
library(xtable)

## -----------------------------------------------------------------------------
fit <- lm(Petal.Length~Petal.Width,data = iris)  # avoid masking the lm() function
plot(fit)

## -----------------------------------------------------------------------------
irisset <- subset(iris,Species=="setosa")
boxplot(irisset[,1:4],main="setosa",col = c(1,2,3,4))

## -----------------------------------------------------------------------------
knitr::kable(head(iris[,1:4]))

## ----xtable,results='asis'----------------------------------------------------
print(xtable(head(iris[,1:4])),type = "html")

## -----------------------------------------------------------------------------
set.seed(1234)

# generate n random numbers from the Rayleigh distribution with scale parameter sigma
rRayleigh <- function(n,sigma){
  u <- runif(n)
  x <- sqrt(-2*sigma^2*log(1-u)) # inverse transform: F^{-1}(u) for the Rayleigh cdf
  return(x)
}

# generate sample with sigma=5
n <- 1000
sigma <- 5
x <- rRayleigh(n,sigma)

# check that the generation algorithm is correct
hist(x,probability = T,main = expression(sigma==5))
y <- seq(0,20,.01)
lines(y,y/25*exp(-y^2/50)) # Rayleigh(sigma = 5) density

# check that the sample mode is close to sigma (the Rayleigh mode equals sigma)
hist(x=rRayleigh(n,sigma=2),main = expression(sigma==2))
hist(x=rRayleigh(n,sigma=4),main = expression(sigma==4))
hist(x=rRayleigh(n,sigma=5),main = expression(sigma==5))
hist(x=rRayleigh(n,sigma=10),main = expression(sigma==10))
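# Extra sanity check (not part of the original exercise): the Rayleigh mean is
# sigma*sqrt(pi/2), so the sample mean should be close to the theoretical one.
c(sample.mean = mean(x), theoretical.mean = sigma*sqrt(pi/2))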

## -----------------------------------------------------------------------------
set.seed(1234)
n <- 1000  # sample size = 1000

# generate x1,x2,u
x1 <- rnorm(n,mean = 0,sd=1)
x2 <- rnorm(n,mean = 3,sd=1)
u <- runif(n)

# generate x from the mixture p1*N(0,1) + (1-p1)*N(3,1)
rmixture <- function(p1=0.5){ # the second weight is implied: p2 = 1-p1
  ifelse(u <= p1, x1, x2)     # component 1 with probability p1, component 2 otherwise
}

# generate a sample with p1 = 0.75
x <- rmixture(0.75)

# hist plot with density superimposed
hist(x,probability = T,main="p1=0.75",ylim = c(0,0.3))
lines(density(x))
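# Overlay of the true mixture density, added for comparison with the kernel estimate
yy <- seq(-4, 7, .01)
lines(yy, 0.75*dnorm(yy,0,1) + 0.25*dnorm(yy,3,1), lty = 2, col = "red")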

# vary p1
for(p1 in seq(0.1,0.9,0.1)){
  x <- rmixture(p1)
  hist(x,probability = T,main=paste0("p1=",p1),ylim = c(0,0.4))
  lines(density(x))
}

## -----------------------------------------------------------------------------
n <- 1000
rPG <- function(lambda,t,alpha,beta){ # simulate n values of X(t); n comes from the global environment
  x <- vector(length = n)
  for(i in 1:n){
    Nt <- rpois(1,lambda=lambda*t) # Nt ~ Poi(lambda*t)
    y <- rgamma(Nt,alpha,beta)
    x[i] <- sum(y)
  }
  return(x)
}

# compare the estimated mean and variance with the theoretical values
results <- matrix(0,nrow = 4,ncol = 5)
xx <- matrix(0,nrow = 4, ncol = n) # raw samples, one row per parameter setting
colnames(results) <- c("parameter","mean","true mean","var","true var")

# several choices of the parameters
Lambda <- c(1,1,3,5)
Alpha <- c(1,1,2,3)
Beta <- c(1,2,1,0.5)
for(i in 1:4){
  xx[i,] <- rPG(Lambda[i],10,Alpha[i],Beta[i])
}

results[,2] <- round(rowMeans(xx),4)
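# theoretical values: for X(t) = sum_{i=1}^{N(t)} Y_i with N(t) ~ Poisson(lambda*t)
# and Y_i ~ Gamma(alpha,beta), E[X(t)] = lambda*t*alpha/beta and
# Var[X(t)] = lambda*t*E[Y^2] = lambda*t*alpha*(alpha+1)/beta^2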
results[,3] <- Lambda*10*Alpha/Beta
results[,5] <- Lambda*10*Alpha*(1+Alpha)/(Beta^2)
for(i in 1:4){
  results[i,1] <- paste("lambda=",Lambda[i],",alpha=",Alpha[i],",beta=",Beta[i])
  results[i,4] <- round(var(xx[i,]),4)
}

library(knitr)
knitr::kable(results)

## -----------------------------------------------------------------------------
set.seed(121)
x <- seq(.1,.9,length.out=9) # points x in (0,1) at which to estimate F(x)
m <- 10000
y <- runif(m)
cdf <- numeric(length(x)) # F(x)

for(i in 1:length(x)){
  if(x[i] <= 0) cdf[i] <- 0
  else if(x[i] >= 1) cdf[i] <- 1 # F(x) = 1 beyond the support
  else{
    g <- 30*x[i]^3*y^2*(1-x[i]*y)^2 # integrand after the substitution t = x*y
    cdf[i] <- mean(g)
  }
}
result <- rbind(x,cdf,pbeta(x,3,3))
rownames(result) <- c('x','cdf',"pbeta")
knitr::kable(result)

## -----------------------------------------------------------------------------
# generate samples
MC.Rayleigh <- function(sigma, m=10000, antithetic=TRUE){
  u <- runif(m)
  if(!antithetic) v <- runif(m) # independent
  else v <- 1-u
  x <- (sqrt(-2*sigma^2*log(1-u))+sqrt(-2*sigma^2*log(1-v)))/2
  return(x)
}
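# Why this works: sqrt(-2*sigma^2*log(1-u)) is monotone in u, so u and 1-u
# produce negatively correlated draws, and averaging each pair lowers the
# variance of the Monte Carlo estimator.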

sigma <- c(0.5,1,2,5,10,15)
var_re <- numeric(length(sigma)) # proportional variance reduction from antithetic sampling
for(i in 1:length(sigma)){
  MC1 <- MC.Rayleigh(sigma[i],m=1000,antithetic = FALSE)
  MC2 <- MC.Rayleigh(sigma[i],m=1000,antithetic = TRUE)
  var_re[i] <- (var(MC1)-var(MC2))/var(MC1)
}

result2 <- rbind(sigma,var_re)
knitr::kable(result2)

## -----------------------------------------------------------------------------
m <- 10000
theta.hat <- v <- numeric(2) # v holds the variances; avoids masking var()

g <- function(x){
  x^2*exp(-x^2/2)*(x>1)/(sqrt(2*pi))
}


# f1: normal importance function N(1.5, 1)
x <- rnorm(m,mean = 1.5)
g_f <- g(x)/dnorm(x,mean = 1.5)
theta.hat[1] <- mean(g_f)
v[1] <- var(g_f)

# f2: f2(y) = 1/y^2 on (1, Inf)
u <- runif(m)
x <- 1/(1-u) # inverse transform: F2^{-1}(u) = 1/(1-u)
g_f <- g(x)*x^2 # g/f2 = g(x)*x^2
theta.hat[2] <- mean(g_f)
v[2] <- var(g_f)

theta.hat
v

## -----------------------------------------------------------------------------
integrate(g, lower = 1, upper = Inf) # true value
#plot
y <- seq(1,10,.01)
plot(y,g(y),type = "l",main = "",ylab = "",ylim = c(0,2),col="red")
lines(y,dnorm(y,mean = 1.5),lty=2,col=3)
lines(y,1/(y^2),lty=3,col=4)
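# Extra diagnostic (a sketch, not part of the original solution): for a good
# importance function the ratio g/f is nearly constant; f1 tracks g much more
# closely than f2, which explains its smaller variance.
plot(y, g(y)/dnorm(y, mean = 1.5), type = "l", ylab = "g/f", col = 3)
lines(y, g(y)*y^2, lty = 3, col = 4)
legend("topright", legend = c("g/f1", "g/f2"), lty = c(1, 3), col = c(3, 4))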

## -----------------------------------------------------------------------------
set.seed(200)
m <- 10000
g <- function(x){
  x^2*exp(-x^2/2)*(x>1)/(sqrt(2*pi))
}


# f1
x <- rnorm(m,mean = 1.5)
g_f <- g(x)/dnorm(x,mean = 1.5)
estimate <- mean(g_f)
true <- integrate(g, lower = 1, upper = Inf)$value # true value
c(estimate,true)

## -----------------------------------------------------------------------------
# Symmetric t-intervals
n <- 20 # sample size
alpha <- 0.05
set.seed(200)
CL <- replicate(1000,expr = { 
  x <- rchisq(n,df=2)
  L <- mean(x)-qt(1-alpha/2,df=n-1)*sd(x)/sqrt(n)   # Left end of interval
  R <-  mean(x)+qt(1-alpha/2,df=n-1)*sd(x)/sqrt(n)  # Right end of interval
  UCL <- (n-1)*var(x)/qchisq(alpha,df=n-1) 
  c(L,R,UCL)
})

# Empirical coverage of the symmetric t-interval (the true mean of chi^2(2) is 2)
mean(CL[1,] <= 2 & CL[2,] >= 2)

mean(CL[3,] > 4) # empirical coverage of the one-sided UCL for the variance (true variance is 4)

## -----------------------------------------------------------------------------
# Type I error
m <- 10000 
n <- 20    # sample size
alpha <- 0.05
p_chi <- p_unif <- p_expr <- numeric(m) # p-values for each replicate

# three distributions
set.seed(123)
for(i in 1:m){
  chi <- rchisq(n,df=1)
  unif <- runif(n,0,2)
  expr <- rexp(n ,rate = 1)
  test1 <- t.test(chi,alternative="two.sided",mu=1,conf.level=0.95)
  test2 <- t.test(unif,alternative="two.sided",mu=1,conf.level=0.95)
  test3 <- t.test(expr,alternative="two.sided",mu=1,conf.level=0.95)
  p_chi[i] <- test1$p.value
  p_unif[i] <- test2$p.value
  p_expr[i] <- test3$p.value
}
p_chi.hat <- mean(p_chi < alpha)
p_unif.hat <- mean(p_unif < alpha)
p_expr.hat <- mean(p_expr < alpha)
c(p_chi.hat,p_unif.hat,p_expr.hat)

## -----------------------------------------------------------------------------
mat <-
  matrix(c(6510, 3490, 10000, 6760, 3240, 10000, 13270, 6730, 20000), 3, 3,
         dimnames = list(
           c("Rejected", "Accepted", "total"),
           c("method A", "method B", "total")
         ))
mat
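# A possible follow-up (an assumption on my part, since the exercise text is not
# reproduced here): if the two methods were applied to independent samples, a
# two-sample proportion test could compare the rejection rates.
prop.test(x = c(6510, 6760), n = c(10000, 10000))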

## ----eval=FALSE---------------------------------------------------------------
#  nn <- c(10,20,30,50,100,500)  # sample sizes
#  alpha <- 0.05                 # significance level
#  d <- 2                        # dimension of the random vector
#  b0 <- qchisq(1-alpha,df=d*(d+1)*(d+2)/6)*6/nn  # critical values of the skewness statistic for each sample size
#  
#  # compute Mardia's multivariate sample skewness statistic
#  mul.sk <- function(x){
#    n <- nrow(x)
#    xbar <- colMeans(x)
#    sigma.hat <- (n-1)/n*cov(x) # MLE
#    V <- solve(sigma.hat)
#  
#    b <- 0
#    for(i in 1:nrow(x)){
#      for(j in 1:nrow(x)){
#        b <- b+((x[i,]-xbar)%*%V%*%(x[j,]-xbar))^3
#      }
#    }
#    return(b/(n^2))
#  }
#  
#  # empirical estimate of the type I error rate
#  library(mvtnorm)
#  set.seed(200)
#  p.reject <- vector(mode = "numeric",length = length(nn))
#  
#  m <- 1000
#  
#  for(i in 1:length(nn)){
#    mul.sktests <- vector(mode = "numeric",length = m)
#    for(j in 1:m){
#      data <- rmvnorm(nn[i],mean = rep(0,d))
#      mul.sktests[j] <- as.integer(mul.sk(data)>b0[i])
#    }
#    p.reject[i] <- mean(mul.sktests)
#  }
#  p.reject

## ----eval=FALSE---------------------------------------------------------------
#  summ <- rbind(nn,p.reject)
#  rownames(summ) <- c("n","estimate")
#  knitr::kable(summ)

## ----eval=FALSE---------------------------------------------------------------
#  alpha <- 0.1
#  n <- 30      # sample size
#  m <- 2000
#  epsilon <- c(seq(0,0.15,0.01),seq(0.15,1,0.05))
#  N <- length(epsilon)
#  power <- vector(mode = "numeric",length = N)
#  b0 <- qchisq(1-alpha,df=d*(d+1)*(d+2)/6)*6/n  # critical value
#  
#  # estimate the empirical power for each value of epsilon
#  for(j in 1:N){
#    e <- epsilon[j]
#    mul.sktests <- numeric(m)
#    for(i in 1:m){
#      # generate from the contaminated normal mixture
#      u <- sample(c(1,0),size = n,replace = T,prob = c(1-e,e))
#      data1 <- rmvnorm(n,sigma = diag(1,d))
#      data2 <- rmvnorm(n,sigma = diag(100,d))
#      data <- u*data1+(1-u)*data2
#      mul.sktests[i] <- as.integer(mul.sk(data)>b0)
#    }
#    power[j] <- mean(mul.sktests)
#  }
#  
#  # Plot the power function
#  plot(epsilon,power,type="b",xlab=bquote(epsilon),ylim=c(0,1))
#  abline(h=0.1,lty=3,col="lightblue")
#  se <- sqrt(power*(1-power)/m)
#  lines(epsilon,power-se,lty=3)
#  lines(epsilon,power+se,lty=3)

## -----------------------------------------------------------------------------
library(boot);library(bootstrap)
data(scor)

set.seed(200)
lambda_hat <- eigen(cov(scor))$values
theta_hat <- lambda_hat[1]/sum(lambda_hat)
B <- 2000     
boot.theta <- function(x,index){
  lambda <- eigen(cov(x[index,]))$values
  theta <- lambda[1]/sum(lambda)
  return(theta)
}

bootstrap_result <- boot(data = scor,statistic = boot.theta,R=B)
theta_b <- bootstrap_result$t
bias_boot <- mean(theta_b)-theta_hat   # bias
se_boot <- sqrt(var(theta_b))          # se
c(bias_boot=bias_boot,se_boot=se_boot)

## -----------------------------------------------------------------------------
# Jackknife
n <- nrow(scor)
theta_jack <- rep(0,n)
for(i in 1:n){
  x <- scor[-i,]
  lambda <- eigen(cov(x))$values
  theta_jack[i] <- lambda[1]/sum(lambda)
}
bias_jack <- (n-1)*(mean(theta_jack)-theta_hat) # bias
se_jack <- (n-1)*sqrt(var(theta_jack)/n)        # se
c(bias_jack=bias_jack,se_jack=se_jack)

## -----------------------------------------------------------------------------
ci <- boot.ci(bootstrap_result,type = c("perc","bca"))
ci

## -----------------------------------------------------------------------------
set.seed(200);m <- 2000
boot.skewness <- function(x,i){
  a <- sum((x[i]-mean(x[i]))^3)/length(x[i])
  b <- (sum((x[i]-mean(x[i]))^2)/length(x[i]))^(3/2)
  return(a/b)
}
ci.norm <- ci.basic <- ci.perc <- matrix(0,m,2)

# normal population (true skewness = 0)
skew_normal <- 0
for(i in 1:m){
  xx <- rnorm(n=100,0,1)
  boot_res <- boot(data=xx,statistic=boot.skewness,R=999) # bootstrap
  ci <- boot.ci(boot_res,type=c("norm","basic","perc"))
  ci.norm[i,] <- ci$norm[2:3]
  ci.basic[i,] <- ci$basic[4:5]
  ci.perc[i,] <- ci$percent[4:5]
}

# Confidence interval coverage rate
coverage_rate <- c(mean(ci.norm[,1] <= skew_normal & ci.norm[,2] >= skew_normal),
                   mean(ci.basic[,1] <= skew_normal & ci.basic[,2] >= skew_normal),
                   mean(ci.perc[,1] <= skew_normal & ci.perc[,2] >= skew_normal))

# Proportion of misses on the left
miss_left <- c(mean(ci.norm[,1] > skew_normal),
               mean(ci.basic[,1] > skew_normal),
               mean(ci.perc[,1] > skew_normal))

# Proportion of misses on the right
miss_right <- c(mean(ci.norm[,2] < skew_normal),
                mean(ci.basic[,2] < skew_normal),
                mean(ci.perc[,2] < skew_normal))

cat('For normal populations, the estimated coverage probability of the normal bootstrap CI =',
    coverage_rate[1], ', the basic CI =', coverage_rate[2],
    'and the percentile CI =', coverage_rate[3])

normal <- cbind(coverage_rate,miss_left,miss_right)
rownames(normal) <- c("normal","basic","percentile")
knitr::kable(normal)

## -----------------------------------------------------------------------------
# chi^2(5) population
ci.norm <- ci.basic <- ci.perc <- matrix(0,m,2)
skew_chi <- sqrt(8/5) # true skewness of chi^2(df) is sqrt(8/df)
for(i in 1:m){
  xx <- rchisq(n=100,df=5)
  boot_res <- boot(data=xx,statistic=boot.skewness,R=999) # bootstrap
  ci <- boot.ci(boot_res,type=c("norm","basic","perc"))
  ci.norm[i,] <- ci$norm[2:3]
  ci.basic[i,] <- ci$basic[4:5]
  ci.perc[i,] <- ci$percent[4:5]
}

coverage_rate <- c(mean(ci.norm[,1] <= skew_chi & ci.norm[,2] >= skew_chi),
                   mean(ci.basic[,1] <= skew_chi & ci.basic[,2] >= skew_chi),
                   mean(ci.perc[,1] <= skew_chi & ci.perc[,2] >= skew_chi))

# Proportion of misses on the left
miss_left <- c(mean(ci.norm[,1] > skew_chi),
               mean(ci.basic[,1] > skew_chi),
               mean(ci.perc[,1] > skew_chi))

# Proportion of misses on the right
miss_right <- c(mean(ci.norm[,2] < skew_chi),
                mean(ci.basic[,2] < skew_chi),
                mean(ci.perc[,2] < skew_chi))

cat('For chi-square populations, the estimated coverage probability of the normal bootstrap CI =',
    coverage_rate[1], ', the basic CI =', coverage_rate[2],
    'and the percentile CI =', coverage_rate[3])

chi <- cbind(coverage_rate,miss_left,miss_right)
rownames(chi) <- c("normal","basic","percentile")
knitr::kable(chi)

## -----------------------------------------------------------------------------
# Generate two related samples
library(MASS)
set.seed(200)
data <- mvrnorm(n=25,mu=c(1,2),Sigma = matrix(c(2,1,1,3),nrow=2,ncol=2))
x <- data[,1]
y <- data[,2]

# Permutation test
R <- 2000-1
z <- c(x,y)  # pooled sample
n1 <- length(x);n2 <- length(y)
K <- 1:(n1+n2)
reps <- vector(mode = "numeric",length = R)
rho_0 <- cor(x,y,method = "spearman")

for(i in 1:R){
  k <- sample(K, size = n1,replace = F)
  x1 <- z[k]
  y1 <- z[-k]
  reps[i] <- cor(x1,y1,method = "spearman")
}
p_permu <- mean(c(rho_0,reps)>=rho_0) # one-sided permutation p-value
p_cortest <- cor.test(x,y)$p.value
round(c(p_permu,p_cortest),4)

# independent sample
data <- mvrnorm(n=25,mu=c(1,0),Sigma = matrix(c(2,0,0,3),nrow = 2))
x <- data[,1]
y <- data[,2]

# permutation test 
z <- c(x,y)
reps <- vector(mode = "numeric",length = R)
rho_0 <- cor(x,y,method = "spearman")

for(i in 1:R){
  k <- sample(K, size = n1,replace = F)
  x1 <- z[k]
  y1 <- z[-k]
  reps[i] <- cor(x1,y1,method = "spearman")
}
p_permu <- mean(abs(c(rho_0,reps))>=abs(rho_0)) # two-sided permutation p-value
p_cortest <- cor.test(x,y)$p.value
round(c(p_permu,p_cortest),4)

## ----eval=FALSE---------------------------------------------------------------
#  library(RANN)
#  library(boot)
#  library(energy)
#  library(Ball)
#  library(MASS)
#  
#  # NN
#  Tn <- function(z, index, sizes,k) {
#    n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
#    if(is.vector(z)) z <- data.frame(z,0);
#    z <- z[index, ];
#    NN <- nn2(data=z, k=k+1)
#    block1 <- NN$nn.idx[1:n1,-1]
#    block2 <- NN$nn.idx[(n1+1):n,-1]
#    i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
#    (i1 + i2) / (k * n)
#  }
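#  # Tn is the k-nearest-neighbour statistic: for each point in the pooled
#  # sample it counts the fraction of its k nearest neighbours that belong to
#  # the same sample; values large relative to the permutation replicates
#  # suggest the two distributions differ.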
#  
#  eqdist.nn <- function(z,sizes,k){
#    boot.obj <- boot(data=z,statistic=Tn,R=R, sim = "permutation", sizes = sizes,k=k)
#    ts <- c(boot.obj$t0,boot.obj$t)
#    p.value <- mean(ts>=ts[1])
#    list(statistic=ts[1],p.value=p.value)
#  }

## ----eval=FALSE---------------------------------------------------------------
#  k <- 3;R <- 500-1;m <- 200
#  
#  # setting
#  n1 <- n2 <- 20
#  N <- c(n1,n2)
#  set.seed(123)
#  
#  # power
#  p.values <- matrix(NA,m,3)
#  for(i in 1:m){
#    x <- mvrnorm(n1,c(0,0),diag(1,2))
#    y <- mvrnorm(n2,c(0,0),diag(3,2))
#    z <- rbind(x,y)
#    p.values[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values[i,3] <- bd.test(x,y,num.permutations=R,seed=i*123)$p.value
#  }
#  alpha <- 0.1;
#  pow <- colMeans(p.values<alpha)
#  pow

## ----eval=FALSE---------------------------------------------------------------
#  k <- 3;R <- 500-1;m <- 200
#  
#  # setting
#  n1 <- n2 <- 20
#  N <- c(n1,n2)
#  set.seed(123)
#  
#  # power
#  p.values <- matrix(NA,m,3)
#  for(i in 1:m){
#    x <- rnorm(n1,-1,2)
#    y <- rnorm(n2,0.5,1)
#    z <- c(x,y)
#    p.values[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values[i,3] <- bd.test(x,y,num.permutations=R,seed=i*123)$p.value
#  }
#  alpha <- 0.05;
#  pow <- colMeans(p.values<alpha)
#  pow

## ----eval=FALSE---------------------------------------------------------------
#  k <- 3;R <- 500-1;m <- 200
#  
#  # setting
#  n1 <- n2 <- 20
#  N <- c(n1,n2)
#  set.seed(123)
#  
#  # power
#  p.values <- matrix(NA,m,3)
#  for(i in 1:m){
#    x <- rt(n1,df=1)
#    index <- sample(c(1,0),size=n2,replace=T,prob=c(0.5,0.5))
#    y <- index*rnorm(n2,0,1)+(1-index)*rnorm(n2,1,sqrt(0.5))
#    z <- c(x,y)
#    p.values[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values[i,3] <- bd.test(x,y,num.permutations=R,seed=i*123)$p.value
#  }
#  alpha <- 0.05;
#  pow <- colMeans(p.values<alpha)
#  pow

## ----eval=FALSE---------------------------------------------------------------
#  k <- 3;R <- 500-1;m <- 200
#  
#  # setting
#  n1 <- 10
#  n2 <- 100
#  N <- c(n1,n2)
#  set.seed(123)
#  
#  # power
#  p.values <- matrix(NA,m,3)
#  for(i in 1:m){
#    x <- mvrnorm(n1,c(0,0),diag(1,2))
#    y <- mvrnorm(n2,c(0,1),diag(3,2))
#    z <- rbind(x,y)
#    p.values[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values[i,3] <- bd.test(x,y,num.permutations=R,seed=i*123)$p.value
#  }
#  alpha <- 0.05;
#  pow <- colMeans(p.values<alpha)
#  pow

## -----------------------------------------------------------------------------
m <- 5000;set.seed(200);x <- numeric(m);u <- runif(m)
x[1] <- rnorm(1) # generate X_0
for(i in 2:m){
  xt <- x[i-1]
  y <- rnorm(1,mean = 0,sd = abs(xt))
  r <- dcauchy(y)*dnorm(xt,0,abs(y))/(dcauchy(xt)*dnorm(y,0,abs(xt))) # M-H acceptance ratio; r avoids masking rt()
  if(u[i]<=r) x[i] <- y
  else x[i] <- xt
}
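# Added diagnostic: approximate acceptance rate of the chain (the proportion of
# iterations in which the proposal was accepted).
mean(x[-1] != x[-m])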

## -----------------------------------------------------------------------------
data <- x[1001:m] # discard the first 1000 values as burn-in
q <- seq(.1,.9,.1)
sample_deciles <- quantile(data,q)
cauchy_deciles <- qcauchy(q)

result <- rbind(sample_deciles,cauchy_deciles)
colnames(result) <- q
knitr::kable(result)

## -----------------------------------------------------------------------------
m <- 5000;set.seed(1234);x <- numeric(m);u <- runif(m)
x[1] <- rnorm(1) 
for(i in 2:m){
  xt <- x[i-1]
  y <- rnorm(1,mean = xt,sd = 1)
  r <- dcauchy(y)*dnorm(xt,y,1)/(dcauchy(xt)*dnorm(y,xt,1)) # M-H acceptance ratio
  if(u[i]<=r) x[i] <- y
  else x[i] <- xt
}

data <- x[1001:m] # discard the first 1000 values as burn-in
q <- seq(.1,.9,.1)
sample_deciles <- quantile(data,q)
cauchy_deciles <- qcauchy(q)

result <- rbind(sample_deciles,cauchy_deciles)
colnames(result) <- q
knitr::kable(result)

## -----------------------------------------------------------------------------
Gibbs_sample <- function(N,a,b,n){ # N is the length of chain
  X <- matrix(0,nrow = N,ncol = 2) 
  X[1,] <- c(floor(n/2),0.5)       # Initial value
  for(i in 2:N){
    y <- X[i-1,2]
    X[i,1] <- rbinom(1,size = n,prob = y) # x | y ~ Binomial(n, y)
    x <- X[i,1]
    X[i,2] <- rbeta(1,x+a,n-x+b)          # y | x ~ Beta(x+a, n-x+b)
  }
  return(X)
}

data <- Gibbs_sample(N=5000,a=5,b=5,n=50)
burn <- 1000
data <- data[(burn+1):5000,]        
plot(data,col="lightblue",pch=20,cex=0.6,xlab = "x",ylab = "y",ylim = c(0,1),xlim = c(0,50))
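# Added check (a sketch): under the target density the marginal of x is
# Beta-Binomial(n = 50, a = 5, b = 5), so the empirical frequencies should
# track the pmf choose(n,x)*B(x+a, n-x+b)/B(a,b).
pmf <- choose(50, 0:50) * beta(0:50 + 5, 50 - 0:50 + 5) / beta(5, 5)
freq <- table(factor(data[,1], levels = 0:50)) / nrow(data)
plot(0:50, freq, type = "h", xlab = "x", ylab = "probability")
lines(0:50, pmf, col = "red")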

## ----eval=FALSE---------------------------------------------------------------
#  # Gelman-Rubin method
#  Gelman.Rubin <- function(psi){
#    psi <- as.matrix(psi)
#    n <- ncol(psi)
#    k <- nrow(psi)
#    psi.means <- rowMeans(psi)
#    B <- n*var(psi.means)
#    psi.w <- apply(psi,1,"var") # within variances
#    W <- mean(psi.w)
#    v.hat <- W*(n-1)/n + (B/n)
#    r.hat <- v.hat/W   # G-R statistic
#    return(r.hat)
#  }

## ----eval=FALSE---------------------------------------------------------------
#  # 9.3
#  cauchy.chain <- function(N,X1){
#    x <- rep(0,N)
#    x[1] <- X1
#    u <- runif(N)
#    for(i in 2:N){
#      xt <- x[i-1]
#      y <- rnorm(1,mean = xt,sd = 1)
#      r <- dcauchy(y)*dnorm(xt,y,1)/(dcauchy(xt)*dnorm(y,xt,1)) # M-H acceptance ratio
#      if(u[i]<=r) x[i] <- y
#      else x[i] <- xt
#    }
#    return(x)
#  }
#  
#  k <- 4
#  n <- 20000
#  b <- 2000  # burn-in length
#  
#  init <- c(-1.2,-1,-1.1,-0.5);set.seed(200) # Initial value
#  X <- matrix(0,nrow = k,ncol = n)
#  for(i in 1:k){
#    X[i,] <- cauchy.chain(n,init[i])
#  }
#  
#  # Calculate diagnostic statistics
#  psi <- t(apply(X,1,cumsum))
#  for(i in 1:nrow(psi)){
#    psi[i,] <- psi[i,]/(1:ncol(psi))
#  }
#  print(Gelman.Rubin(psi))
#  
#  # plot
#  rhat <- numeric(n)
#  for(j in (b+1):n){
#    rhat[j] <- Gelman.Rubin(psi[,1:j])
#  }
#  plot(rhat[(b+1):n],type="l",xlab="",ylab="R")
#  abline(h=1.2,lty=2)

## ----eval=FALSE---------------------------------------------------------------
#  # 9.8
#  k <- 4
#  n <- 15000
#  burn <- 1000  # burn-in length
#  
#  
#  X <- matrix(0,nrow = k,ncol = n)
#  Y <- matrix(0,nrow = k,ncol = n)
#  set.seed(12345)
#  for(i in 1:k){
#    chain <- Gibbs_sample(n,5,5,30)
#    X[i,] <- chain[,1]
#    Y[i,] <- chain[,2]
#  }
#  
#  # Calculate diagnostic statistics of X
#  psi <- t(apply(X,1,cumsum))
#  for(i in 1:nrow(psi)){
#    psi[i,] <- psi[i,]/(1:ncol(psi))
#  }
#  print(Gelman.Rubin(psi))
#  
#  rhat <- numeric(n)
#  for(j in (burn+1):n){
#    rhat[j] <- Gelman.Rubin(psi[,1:j])
#  }
#  plot(rhat[(burn+1):n],type="l",xlab="X",ylab="R")
#  abline(h=1.2,lty=2)
#  
#  # Calculate diagnostic statistics of Y
#  psi <- t(apply(Y,1,cumsum))
#  for(i in 1:nrow(psi)){
#    psi[i,] <- psi[i,]/(1:ncol(psi))
#  }
#  print(Gelman.Rubin(psi))
#  
#  rhat <- numeric(n)
#  for(j in (burn+1):n){
#    rhat[j] <- Gelman.Rubin(psi[,1:j])
#  }
#  plot(rhat[(burn+1):n],type="l",xlab="Y",ylab="R")
#  abline(h=1.2,lty=2)

## -----------------------------------------------------------------------------
ak <- function(k,a){
  d <- length(a)
  norm.a2 <- sum(a^2) # squared Euclidean norm ||a||^2
  # compute the coefficient on the log scale to avoid overflow; note k! = gamma(k+1)
  g1 <- exp(lgamma((d+1)/2)+lgamma(k+3/2)-lgamma(k+d/2+1)-lgamma(k+1)-k*log(2))
  return((-1)^k*norm.a2^(k+1)*g1/((2*k+1)*(2*k+2)))
}

## -----------------------------------------------------------------------------
summ <- function(K,a){ # partial sum of the series, truncated at k = K
  sum(ak(0:K,a))
}

## -----------------------------------------------------------------------------
summ(200,c(1,2))
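# The terms alternate in sign and decay rapidly, so the truncated sum is stable:
# K = 20 already agrees with K = 200.
c(K20 = summ(20,c(1,2)), K200 = summ(200,c(1,2)))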

## -----------------------------------------------------------------------------
# 11.4
k <- c(4:25,100,500,1000)

s <- function(k,a){
  # S_k(a) = P( T > sqrt(a^2*k/(k+1-a^2)) ) for T ~ t(k)
  return(pt(sqrt(a^2*k/(k+1-a^2)),df=k,lower.tail = F))
}

delta.s <- function(k,a){
  s(k,a)-s(k-1,a)
}

n <- length(k)
A <- numeric(n)
for(i in 1:n){
  A[i] <- uniroot(delta.s,interval=c(1,2),k=k[i])$root
}

result <- cbind(k,A)
knitr::kable(result)

## -----------------------------------------------------------------------------
# 11.5
f1 <- function(k,a){
  integrate(dt, lower = 0, upper = sqrt(a^2*k/(k+1-a^2)), df = k)$value
}
f <- function(k, a){
  f1(k, a)-f1(k-1, a)
}

k <- c(4:25, 100, 500, 1000)
n <- length(k)
a <- numeric(n)
for(i in 1:n){
 a[i] <- uniroot(f, interval=c(1, 2), k = k[i])$root
}

result <- cbind(k,a)
knitr::kable(result)
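# Exercises 11.4 and 11.5 solve the same equation (equal t tail probabilities),
# so the two sets of roots should agree up to uniroot's default tolerance.
max(abs(A - a))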

## -----------------------------------------------------------------------------
y <- c(0.54,0.48,0.33,0.43,1.00,1.00,0.91,1.00,0.21,0.85)

f <- function(lambda){
  10/(6.75+3/lambda)
}
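# Why this update works (EM for exponential data right-censored at 1): the ten
# observations contain 7 exact values summing to 3.75 and 3 values censored at
# 1.00. By memorylessness, E[Y | Y > 1] = 1 + 1/lambda, so the E-step replaces
# the total by 6.75 + 3/lambda and the M-step sets lambda = 10 / (that total).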

lambda.EM <- function(N,epsilon){
  lambda0 <- 1 
  for(i in 1:N){
    lambda1 <- f(lambda0)
    if(abs(lambda1-lambda0) < epsilon){
      return(lambda1) 
    }
    else lambda0 <- lambda1
  }
  print("EM algorithm is not covergent")
}

N <- 1000
epsilon <- 10^(-8)
EM <- lambda.EM(N,epsilon)

## -----------------------------------------------------------------------------
lambda.MLE <- 7/6.75 # 7 uncensored observations / total observed time 6.75
print(c(EM=EM,MLE=lambda.MLE))

## ----eval=FALSE---------------------------------------------------------------
#  trims <- c(0,0.1,0.2,0.5)
#  x <- rcauchy(100)
#  
#  lapply(trims,function(trim) mean(x,trim=trim))
#  lapply(trims, mean,x=x)

## -----------------------------------------------------------------------------
rsq <- function(mod) summary(mod)$r.squared

# 3
formulas <- list(
  mpg ~ disp,
  mpg ~ I(1/disp),
  mpg ~ disp + wt,
  mpg ~ I(1/disp) + wt
)
model1 <- lapply(formulas,lm,data=mtcars) # Use lapply()
unlist(lapply(model1,rsq)) # R2

loop1 <- vector("list",length = length(formulas)) # Use loops
for(i in seq_along(formulas)){
  loop1[[i]] <- lm(formulas[[i]],data = mtcars)
}
unlist(lapply(loop1,rsq)) # R2

# 4
bootstraps <- lapply(1:10, function(i){
  rows <- sample(1:nrow(mtcars),replace = T) # resample rows with replacement
  mtcars[rows,]
})

model2 <- lapply(bootstraps,lm,formula=mpg ~ disp) # Use lapply()
unlist(lapply(model2,rsq)) # R2

loop2 <- vector("list",length = length(bootstraps))
for(i in seq_along(bootstraps)){
  loop2[[i]] <- lm(mpg ~ disp,data=bootstraps[[i]])
}
unlist(lapply(loop2,rsq)) # R2

## -----------------------------------------------------------------------------
# numeric data frame
vapply(mtcars, sd ,numeric(1))  # Specify the return value type as numeric

## -----------------------------------------------------------------------------
# mixed data frame
data("iris")
sapply(iris, class)  
vapply(iris[vapply(iris, is.numeric, logical(1))],sd,numeric(1))

## ----eval=FALSE---------------------------------------------------------------
#  library(parallel)
#  detectCores()  # Check the number of cores currently available
#  
#  mcsapply <- function(x,fun){
#    cl <- makeCluster(detectCores())
#    results <- parSapply(cl,x,fun)
#    stopCluster(cl)
#    return(results)
#  }
#  
#  boot_lm <- function(i){
#    boot_df <- function(x) x[sample(nrow(x),replace = T),]
#    summary(lm(mpg~disp,data=boot_df(mtcars)))$r.squared
#  }
#  
#  n <- 4000
#  system.time(sapply(1:n,boot_lm))
#  system.time(mcsapply(1:n,boot_lm))

## ----eval=FALSE---------------------------------------------------------------
#  #include <Rcpp.h>
#  using namespace Rcpp;
#  
#  // [[Rcpp::export]]
#  NumericMatrix gibbsC(int N, double a, double b, int n, int thin) {
#    NumericMatrix mat(N, 2);
#    double x = floor(n/2), y = 0.5;
#    for(int i = 0; i < N; i++) {
#      for(int j = 0; j < thin; j++) {
#        x = rbinom(1, n, y )[0];
#        y = rbeta(1, x+a, n-x+b)[0];
#      }
#      mat(i, 0) = x;
#      mat(i, 1) = y;
#    }
#    return(mat);
#  }

## ----eval=FALSE---------------------------------------------------------------
#  library(Rcpp)
#  dir_cpp <- '../Rcpp/'
#  sourceCpp(paste0(dir_cpp,"gibbsC.cpp"))

## ----eval=FALSE---------------------------------------------------------------
#  gibbsR <- function(N,a,b,n,thin){ # N is the length of the chain
#    X <- matrix(0,nrow = N,ncol = 2)
#    x <- floor(n/2)
#    y <- 0.5
#    for(i in 1:N){ # fill every row, matching gibbsC
#      for(j in 1:thin){
#        x <- rbinom(1,size = n,prob = y)
#        y <- rbeta(1,x+a,n-x+b)
#      }
#      X[i,] <- c(x,y)
#    }
#    return(X)
#  }
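
## ----eval=FALSE---------------------------------------------------------------
#  # A possible timing comparison (a sketch, assuming the microbenchmark package
#  # is installed and gibbsC was compiled by sourceCpp above):
#  library(microbenchmark)
#  microbenchmark(gibbsR = gibbsR(1000, 5, 5, 30, 10),
#                 gibbsC = gibbsC(1000, 5, 5, 30, 10))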