## ------------------------------------------------------------------------
set.seed(10)
x <- rnorm(10,3,2)
print(x)
## ------------------------------------------------------------------------
a <- "Double quotes \" delimitate R¡¯s strings."
print(a)
## ------------------------------------------------------------------------
set.seed(10)
y<-rnorm(1000,2,3)
hist(y,breaks = 30)
## ------------------------------------------------------------------------
library(ggplot2)
set.seed(123)#guarantee that the program can be repeated with the same result
x <- 0:4#define r.v. x
p <- c(0.1,0.2,0.2,0.2,0.3)#the relative probability of x
cp <- cumsum(p)#cumulative sum of probability
m <- 1000#size of samples
r <- numeric(m)#construct a zero vector with length of 1000
r <- x[findInterval(runif(m),cp)+1]
# Generate 1000 sample from a uniform distribution and find where does every sample fall in the interval of cp.Every interval corresponds a value of random variable x.
table_r <- table(r)
print(table_r)
#theoretical probability
t_p <- p*1000
names(t_p) <- x
print(t_p)#show the theoretical 1000 samples
a <- data.frame(x,freq = c(105,193,205,200,297,100,200,200,200,300),type = rep(c('random sample','theoretical sample'),each = 5))
ggplot(a,aes(x = x ,y =freq ,fill = type))+geom_col(position = 'dodge')
## ------------------------------------------------------------------------
set.seed(300)
library(ggplot2)
beta_arm <- function(n,a,b){
beta_pdf <- function(x){
(1/beta(a,b))*(x)^(a-1)*(1-x)^(b-1)#define the pdf of beta distribution
}
j <- 0#the time of iteration
k <- 0#the available sample
y <- numeric(n)#a zero vector of length n
while (k < n){
u <- runif(1)
j <- j + 1
x <- runif(1) #random variate from g
if ((a-1)*x^(a-2) * (b-1)*(1-x)^(b-2) > u) {
#accept x
k <- k + 1
y[k] <- x
}
}
return(y)
}
sample_beta <- beta_arm(1000,3,2)#record the sample
head(sample_beta,20)
sample_beta_theo <- rbeta(1000,3,2)#generate the sample by using the function "rbeta"
data_beta <- data.frame(sample = c(sample_beta,sample_beta_theo),class = rep(c('empirical','theoretical'),each = 1000))#construst a data frame in order to generate a figure
ggplot(data_beta,aes(x = sample,fill = class))+geom_histogram(position="identity", alpha=0.5)
## ------------------------------------------------------------------------
set.seed(100)
library(ggplot2)
n <- 1000
r <- 4
beta <- 2#define the parameters
lambda <- rgamma(n, r, beta)#generate the random sample from the gamma distribution with r=4, beta=2
x <- rexp(n, lambda)
head(x,30)#list the first 30 samples
x_data <- data.frame(x1 = x)
ggplot(x_data,aes(x = x1))+geom_histogram(fill = 'skyblue',alpha =0.7,binwidth = 0.4)
## ----ex5.4---------------------------------------------------------------
set.seed(111)
library(ggplot2)
beta33_cdf <- function(n,a,b){
x <- runif(n,a,b)
cdf_est <- (sum(30*(x^2-2*x^3+x^4))/n)*(b-a)#Monta Carlo method
return(cdf_est)
}
a <- data.frame(x = seq(0.1,0.9,0.1), MontaCarlo = numeric(9),pbeta = numeric(9))#construct a data frame to show the value of cdf
i <- 1
while(i<=9){
a[i,2] <- beta33_cdf(10000,0,i*0.1)
a[i,3] <- pbeta(i*0.1,3,3)
i <- i+1
}
print(a)
b <- data.frame(x = c(a$x,a$x),pdf = c(a$MontaCarlo,a$pbeta),class = rep(c('MontaCarlo','pbeta'),each=9))
#contruct b in order to use ggplot2 easily
ggplot(data = b)+
geom_col(aes(x = x,y = pdf, fill = class),position = 'dodge')+scale_fill_brewer(palette = 'Set1')
## ----ex5.9---------------------------------------------------------------
set.seed(100)
Rayleigh <- function(n,sigma,ant = T){
m <- floor(n/2)
u <- runif(m)
if(ant) v <- 1-u else v <- runif(m)
u <- c(u,v)
sample <- numeric(length(u))
sample <- (-(2*sigma^2)*(log(1-u)))^(1/2)
return(sample)
}
sample <- Rayleigh(1000,4)
head(sample,20)#out of simplicity, only 20 samples are printed
## ----variance reduction--------------------------------------------------
set.seed(100)
library(ggplot2)
idpt_sample <- Rayleigh(4000,2,ant=F)#use Rayleigh function to obtain 4000 independent samples
ant_sample <- Rayleigh(4000,2,ant=T)#use Rayleigh function to obtain 4000 antithetic samples
var_idpt_sample <- var((idpt_sample[1:2000]+idpt_sample[2001:4000])/2)#compute the variance of independent samples
var_ant_sample <- var((ant_sample[1:2000]+ant_sample[2001:4000])/2)#compute the variance of antithetic samples
cat('The variance of independent sample is',var_idpt_sample,'.\n
The variance of ant_sample is',var_ant_sample,'.\n
The pecent of variance reduction is', (var_idpt_sample-var_ant_sample)/var_idpt_sample,
'.\n\nThe covariance of independent sample is', cov(idpt_sample[1:2000],idpt_sample[2001:4000]),', while the covariance of antithetic sample is', (cov(ant_sample[1:2000],ant_sample[2001:4000])))#show the variances
a <- data.frame(class <- c('independent sample','antithetic sample'),
variance <- c(var_idpt_sample,var_ant_sample))
ggplot(a)+geom_col(aes(x = class,y = variance),fill = 'LightSkyBlue3')+ylim(0,1)+
coord_flip()#use figure to compare the variance reduction
## ----ex5.13--------------------------------------------------------------
set.seed(100)
n <- 10000
theta.hat <- numeric(2)
se <- numeric(2)
g <- function(x){
x^2*exp(-(x^2)/2)/(2*pi)^(1/2)*(x>=1)
}
x <- rexp(n,1)
fg <- g(x)/exp(-x)
theta.hat[1] <- mean(fg)
se[1] <- sd(fg)
x <- Rayleigh(n,1,ant = F)
fg <- g(x)/(x*exp(-(x^2)/2))
theta.hat[2] <- mean(fg)
se[2] <- sd(fg)
integrate(g,1,Inf)
a <- data.frame(theta.hat,se)
print(a)
## ----ex5.14--------------------------------------------------------------
set.seed(100)
n <- 10000
g <- function(x){
x^2*exp(-(x^2)/2)/(2*pi)^(1/2)*(x>=1)
}
x <- Rayleigh(n,1,ant = F)#density of g(x)/f(x)
fg <- g(x)/(x*exp(-(x^2)/2))#g(x)/f(x)
cat(mean(fg),'\n')#estimate the integral
integrate(g,1,Inf)
## ----ex6.9 lognormal,message=F-------------------------------------------
library(tidyverse)
set.seed(100)
m <- n <- 1000
G_lnorm <- function(n){#a function to compute G-hat
x <- rlnorm(n)
x <- sort(x)#generate order statistics
sum_x <- 0
for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
sum_x <- sum_x/(n^2*mean(x))
return(sum_x)
}
G1 <- numeric(m)#m estimations of G
for(j in 1:m) G1[j] <- G_lnorm(n)
cat('The mean of G-hat is',mean(G1),',the median of G-hat is',median(G1),',and the deciles are',quantile(G1,probs = seq(0,1,0.1)))
G1 <- tibble(G1)
ggplot(G1,aes(x = G1))+geom_histogram(fill = 'royalblue',alpha = 0.7)
## ----ex6.9 uniform,message=F---------------------------------------------
G_unif <- function(n){#a function to compute G-hat
x <- runif(n)
x <- sort(x)#generate order statistics
sum_x <- 0
for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
sum_x <- sum_x/(n^2*mean(x))
return(sum_x)
}
G2 <- numeric(m)#m estimations of G
for(j in 1:m) G2[j] <- G_unif(n)
cat('The mean of G-hat is',mean(G2),',the median of G-hat is',median(G2),',and the deciles are',quantile(G2,probs = seq(0,1,0.1)))
G2 <- tibble(G2)
ggplot(G2,aes(x = G2))+geom_histogram(fill = "CadetBlue3",alpha = 0.7)
## ----ex6.9 Bernoulli,message=F-------------------------------------------
G_bernoulli <- function(n){#a function to compute G-hat
x <- rbernoulli(n)
x <- sort(x)#generate order statistics
sum_x <- 0
for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
sum_x <- sum_x/(n^2*mean(x))
return(sum_x)
}
G3 <- numeric(m)#m estimations of G
for(j in 1:m) G3[j] <- G_bernoulli(n)
cat('The mean of G-hat is',mean(G3),',the median of G-hat is',median(G3),',and the deciles are',quantile(G3,probs = seq(0,1,0.1)))
G3 <- tibble(G3)
ggplot(G3,aes(x = G3))+geom_histogram(fill = 'yellow2',alpha = 0.7)
## ----ex6.10--------------------------------------------------------------
set.seed(100)
n <- 200#the size of x to generate G
m <- 100#the size of G to compute mean of G and sd of G
k <- 200#the size of gamma to estimate the coverage rate
G4 <- numeric(m)
for(i in 1:m) G4[i] <- G_lnorm(n)
gamma_mean <- mean(G4)
gamma_sd <- sd(G4)
#confidence interval (a,b)
a <- gamma_mean-qt(0.975,df = m-1)*gamma_sd/sqrt(m)
b <- gamma_mean+qt(0.975,df = m-1)*gamma_sd/sqrt(m)
gamma <- numeric(k)
#generate another 300 gammas to find whether the confidence interval is valid
for(i in 1:k){
G <- numeric(m)
for(j in 1:m) G[j] <- G_lnorm(n)
gamma[i] <- mean(G)
}
cat('The confidence interval of gamma is (',round(a,4),',',round(b,4),'), and the coverage rate is about ',round((sum(gamma>a&gamma<b)/k),3)*100,'%',sep = '')
## ----ex6.B---------------------------------------------------------------
library(MASS)
set.seed(14)
n <- 30
m <- 5000
pearson_p <- numeric(m)
kendall_p <- numeric(m)
spearman_p <- numeric(m)
sigma <- matrix(c(1,0.3,0.3,1),ncol = 2)
mean <- c(0,1)
#MC
for(i in 1:m){
mltinorm <- mvrnorm(n,mean,sigma)
x <- mltinorm [, 1]
y <- mltinorm [, 2]
#using three methods to implement tests for association
pearson <- cor.test(x,y,method = 'pearson')
kendall <- cor.test(x,y,method = 'kendall')
spearman <- cor.test(x,y,method = 'spearman')
pearson_p[i] <- pearson$p.value
kendall_p[i] <- kendall$p.value
spearman_p[i] <- spearman$p.value
}
#compute the rate that null hypothesis is rejected
cat('pearson:',sum(pearson_p<0.05)/m,
'kendall:',sum(kendall_p<0.05)/m,
'spearman:',sum(spearman_p<0.05)/m)
rate <- tibble(methods = c('pearson','kendall','spearman'),reject_percent = c(sum(pearson_p<0.05)/m,sum(kendall_p<0.05)/m,sum(spearman_p<0.05)/m))
rate$methods <- factor(rate$methods,levels = c('pearson','kendall','spearman'))
ggplot(rate,aes(x = methods,y = reject_percent))+
geom_col(fill = 'Salmon')
## ----ex6.B-2-------------------------------------------------------------
set.seed(6809)
n <- 30
m <- 5000
pearson_p <- numeric(m)
kendall_p <- numeric(m)
spearman_p <- numeric(m)
sigma <- matrix(c(1,0,0,1),ncol = 2)
mean <- c(0,1)
#MC
for(i in 1:m){
mltinorm <- mvrnorm(n, mean, sigma)
x <- mltinorm [, 1]
y <- mltinorm [, 2]
#using three methods to implement tests for association
pearson <- cor.test(x, y, method = 'pearson')
kendall <- cor.test(x, y, method = 'kendall')
spearman <- cor.test(x, y, method = 'spearman')
pearson_p[i] <- pearson$p.value
kendall_p[i] <- kendall$p.value
spearman_p[i] <- spearman$p.value
}
#compute the times that alternative hypothesis is accepted
cat('pearson:',sum(pearson_p<0.05)/m,
'kendall:',sum(kendall_p<0.05)/m,
'spearman:',sum(spearman_p<0.05)/m)
rate <- tibble(methods = c('pearson','kendall','spearman'),reject_percent = c(sum(pearson_p<0.05)/m,sum(kendall_p<0.05)/m,sum(spearman_p<0.05)/m))
rate$methods <- factor(rate$methods,levels = c('pearson','kendall','spearman'))
ggplot(rate,aes(x = methods,y = reject_percent))+
geom_col(fill = 'NavajoWhite2')
## ----exercise 7.1--------------------------------------------------------
set.seed(1)
library(bootstrap)
data(law)
n <- nrow(law)
cor_law <- cor(law$LSAT, law$GPA)
cor_law_jack <- numeric(n)
for (i in 1:n)
cor_law_jack[i] <- cor(law$LSAT[-i], law$GPA[-i])
#bias
bias_cor_law <- (n - 1) * (mean(cor_law_jack) - cor_law)
cat(
'The bias of the correlation statistic computed by jackknife estimation is',
bias_cor_law,
'\n'
)
#standard error
se_cor_law <- sqrt((n - 1) * mean((cor_law_jack - mean(cor_law_jack)) ^ 2))
cat(
'The standard error of the correlation statistic computed by jackknife estimation is',
se_cor_law
)
## ----exercise 7.5,warning=F----------------------------------------------
library(boot)
data("aircondit")
mean_airc <- function(x, i) {
a <- x[i, 1]
return(mean(a))
}
boot.obj <- boot(aircondit, statistic = mean_airc, R = 300)
boot_ci <-
boot.ci(boot.obj, type = c("basic", "norm", "perc", "bca"))
print(boot_ci)
## ----exercise 7.8--------------------------------------------------------
data(scor)
scor_pca <- princomp(scor, cor = F)
scor_pca_summary <- summary(scor_pca)
#the proportion of variance explained by the first principal component is 0.619115
theta.hat <- 0.619115
#jackknife
n <- nrow(scor)
theta.hat.jackknife <- numeric(n)
for (i in 1:n) {
scor_jackknife <- scor[-i, ]
scor_jackknife_cov <- cov(scor_jackknife)
theta.hat.jackknife[i] <-
eigen(scor_jackknife_cov)$value[1] / sum(eigen(scor_jackknife_cov)$value)
}
#bias
bias_theta <- (n - 1) * (mean(theta.hat.jackknife) - theta.hat)
cat(
'The bias of the theta hat computed by jackknife estimation is',
bias_theta,
'\n'
)
#standard error
se_theta <-
sqrt((n - 1) * mean((theta.hat.jackknife - mean(theta.hat.jackknife)) ^ 2))
cat(
'The standard error of the theta hat computed by jackknife estimation is',
se_cor_law
)
## ----exercise 7.11,message=F---------------------------------------------
library(DAAG)
library(tidyverse)
attach(ironslag)
L1 <- lm(magnetic ~ chemical)
L2 <- lm(magnetic ~ chemical + I(chemical ^ 2))
L3 <- lm(log(magnetic) ~ chemical)
L4 <- lm(log(magnetic) ~ log(chemical))
n <- length(ironslag$magnetic) / 2
e1 <- e2 <- e3 <- e4 <- numeric(n * 2)
m <-
##leave-two-out
for (k in (1:n)*2) {
y <- magnetic[-c(k-1,k)]
x <- chemical[-c(k-1,k)]
J1 <- lm(y ~ x)
yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k-1]
e1[k-1] <- magnetic[k-1] - yhat1#error between estimation and observation
yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
e1[k] <- magnetic[k] - yhat1#error between estimation and observation
J2 <- lm(y ~ x + I(x^2))
yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k-1] + J2$coef[3] * chemical[k-1]^2
e2[k-1] <- magnetic[k-1] - yhat2#error between estimation and observation
yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] + J2$coef[3] * chemical[k]^2
e2[k] <- magnetic[k] - yhat2#error between estimation and observation
J3 <- lm(log(y) ~ x)
logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k-1]
yhat3 <- exp(logyhat3)
e3[k-1] <- magnetic[k-1] - yhat3#error between estimation and observation
logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
yhat3 <- exp(logyhat3)
e3[k] <- magnetic[k] - yhat3#error between estimation and observation
J4 <- lm(log(y) ~ log(x))
logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k-1])
yhat4 <- exp(logyhat4)
e4[k-1] <- magnetic[k-1] - yhat4#error between estimation and observation
logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k])
yhat4 <- exp(logyhat4)
e4[k] <- magnetic[k] - yhat4#error between estimation and observation
}
detach()
print(c(mean(e1 ^ 2), mean(e2 ^ 2), mean(e3 ^ 2), mean(e4 ^ 2)))
#choose the model with the least average error
error <- data.frame(model = paste(c('L'),1:4,sep = ''),error = c(mean(e1 ^ 2), mean(e2 ^ 2), mean(e3 ^ 2), mean(e4 ^ 2)))
ggplot(error,aes(x=model,y = error))+
geom_col(fill = 'wheat2')+
xlab('errors of four models')
## ----exercise8.1, message=F----------------------------------------------
set.seed(1)
library("nortest")
attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)
z <- c(x,y)#pooled sample
R <- 999#number of replicates
K <- 1:length(z)
D <- numeric(R)#used to store statistics
CM <- function(x,y){#a function to compute the statistic of Cramer-von Mises
ecdfx <- ecdf(x)
ecdfy <- ecdf(y)
l_x <- length(x)
l_y <- length(y)
sum1 <- sum((ecdfx(x)-ecdfy(x))^2)
sum2 <- sum((ecdfx(y)-ecdfy(y))^2)
w <- l_x*l_y/(l_x+l_y)^2*(sum1+sum2)
return(w)
}
D0 <- CM(x,y)
for(i in 1:R) {
k <- sample(K, size = length(x), replace = F)
x1 <- z[k]
y1 <- z[-k]
D[i] <- CM(x1,y1)
}
p <- mean(c(D0, D) >= D0)
print(p)
hist(D, main = "", freq = FALSE, xlab = "D (p = 0.401)", breaks = "scott")
points(D0, 0, cex = 1, pch = 16)
## ----message=F-----------------------------------------------------------
library(RANN)
library(boot)
library(energy)
library(Ball)
library(ggplot2)
library(survival)
m <- 30#times to loop
k<-3
p<-2#ncol
# mu <- 0.5
n1 <- n2 <- 50#nrow
R<-999#the number of replications in the bd.test function
n <- n1+n2
N = c(n1,n2)
Tn <- function(z, ix, sizes,k) {
n1 <- sizes[1]
n2 <- sizes[2]
n <- n1 + n2
if(is.vector(z)) z <- data.frame(z,0)
z <- z[ix, ]
NN <- nn2(data=z, k=k+1)
block1 <- NN$nn.idx[1:n1,-1]
block2 <- NN$nn.idx[(n1+1):n,-1]
i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
(i1 + i2) / (k * n)
}
eqdist.nn <- function(z,sizes,k){#NN
boot.obj <- boot(data=z,statistic=Tn,R=R,
sim = "permutation", sizes = sizes,k=k)
ts <- c(boot.obj$t0,boot.obj$t)
p.value <- mean(ts>=ts[1])
list(statistic=ts[1],p.value=p.value)
}
p.values <- matrix(NA,m,3)#to store p values
for(i in 1:m) {
x <- matrix(rnorm(n1 * p, sd = 1), ncol = p)#unequal variances
y <- matrix(rnorm(n2 * p, sd = 1.4), ncol = p)
z <- rbind(x, y)
p.values[i, 1] <- eqdist.nn(z, N, k)$p.value#NN
p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value#in the energy package
p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value#"Ball Divergence" in the ball package
}
alpha <- 0.1#confidence level
pow <- apply(p.values<alpha,2,mean)#compute the number of p.values which is less than 0.1 in each column
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
geom_col(fill = 'palegreen3')+
coord_flip()
## ------------------------------------------------------------------------
for(i in 1:m) {
x <- matrix(rnorm(n1 * p, mean = 0.4, sd = 1), ncol = p)#unequal variances and unequal expectations
y <- matrix(rnorm(n2 * p, mean = 0, sd = 1.4), ncol = p)
z <- rbind(x, y)
p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value
}
alpha <- 0.1
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
geom_col(fill = 'lightslategrey')+
coord_flip()
## ------------------------------------------------------------------------
for(i in 1:m) {
x <- matrix(rt(n1 * p,df = 1), ncol = p)# t distribution
y <- matrix(rnorm(n2 * p,sd = sample(c(1,1.3),size = n2*p, prob = c(0.5,0.5),replace = T)), ncol = p)#bimodel distribution
z <- rbind(x, y)
p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value
}
alpha <- 0.01
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
geom_col(fill = 'mediumpurple')+
coord_flip()
## ------------------------------------------------------------------------
n1 <- 50
n2 <- 5
n <- n1+n2
N = c(n1,n2)
for(i in 1:m) {
x <- matrix(rnorm(n1*p,mean = 1), ncol = p)#100 samples
y <- matrix(rnorm(n2*p,mean = 2), ncol = 2)#10 samples
z <- rbind(x, y)
p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value
}
alpha <- 0.1
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
geom_col(fill = 'sienna2')+
coord_flip()
## ----ex9.3,message=F-----------------------------------------------------
library(tidyverse)
set.seed(1)
m <- 20000#the number of samples
x <- numeric(m)#to store the samples
x[1] <- runif(1)#the first sample
u <- runif(m)
standard_Cauchy <- function(x){
return(1/(pi*(1+x^2)))
}
for (i in 1:m) {
proposal<-x[i]+runif(1, min=-1, max=1)
accept<-runif(1)<standard_Cauchy(proposal)/standard_Cauchy(x[i])
x[i+1]<-ifelse(accept==T, proposal, x[i])
}
x <- x[1001:m]
index <- 1001:m
quantile(x,probs = seq(0.1,0.9,0.1))
qcauchy(seq(0.1,0.9,0.1),loc = 0,scale=1)
x_tibble <- tibble(index,x)
ggplot(x_tibble,aes(index,x))+
geom_point(alpha = 5/10,color = 'lightskyblue3')
k <- seq(-10,10,0.02)
hist(x, breaks="scott", main="", xlab="", freq=FALSE)
lines(k,dcauchy(k))
## ----ex9.6,message=F-----------------------------------------------------
set.seed(1)
m <- 5000#length of the chain
w <- 0.25#width of the uniform support set
u <- runif(m)#for accept/reject step
v <- runif(m, -w, w)#proposal distribution
group <- c(125,18,20,34)
x <- numeric(m) #the chain
prob <- function(theta,group){
if(theta<0||theta>=0.8)
return(0)
return((1/2+theta/4)^group[1]*((1-theta)/4)^group[2]*((1-theta)/4)^group[3]*(theta/4)^group[4])
}
x[1] <- 0.4#initialize form 0.4
for (i in 2:m){
theta <- x[i-1]+v[i]
if (u[i] <= prob(theta, group) / prob(x[i-1], group))
x[i] <- theta
else
x[i] <- x[i-1]
}
index <- 1001:m#discard the first thousand sample
theta_hat <- mean(x[index])
print(theta_hat)
x_tibble <- tibble(index = 1001:m,x=x[index])
ggplot(x_tibble,aes(index,x))+
geom_point(alpha = 7/10,color = 'slateblue2')
## ------------------------------------------------------------------------
set.seed(1)
group <- c(125,18,20,34)
k <- 4#number of chains to generate
N <- 5000#length of the chain
b <- 500#burn-in length
Gelman.Rubin <- function(psi) {
psi <- as.matrix(psi)
n <- ncol(psi)
k <- nrow(psi)
psi.means <- rowMeans(psi)#row means
B <- n * var(psi.means)#between variance est.
psi.w <- apply(psi, 1, "var")#within variances
W <- mean(psi.w)#within est.
v.hat <- W * (n - 1) / n + (B / n)#upper variance est.
r.hat <- v.hat / W#G-R statistic
return(r.hat)
}
prob <- function(theta,group){
if(theta<0||theta>=0.9)
return(0)
return((1/2+theta/4)^group[1]*((1-theta)/4)^group[2]*((1-theta)/4)^group[3]*(theta/4)^group[4])
}
Chain <- function(group,N, X1) {
x <- numeric(N)
x[1] <- X1#initialize x[1]
w <- 0.25
u <- runif(N)
v <- runif(m, -w, w)
for (i in 2:N) {
theta <- x[i - 1] + v[i]
if (u[i] <= prob(theta, group) / prob(x[i - 1], group))
x[i] <- theta
else
x[i] <- x[i - 1]
}
return(x)
}
#generate the chains
x0 <- c(0.2, 0.4, 0.6, 0.8)
X <- matrix(0, nrow=k, ncol=N)
for (i in 1:k) X[i, ] <- Chain(group, N, x0[i])
#compute diagnostic statistics
psi <- t(apply(X, 1, cumsum))
for (i in 1:nrow(psi)) psi[i, ] <- psi[i, ] / (1:ncol(psi))
print(Gelman.Rubin(psi))
#plot psi for the four chains
for (i in 1:k)
plot(psi[i, (b+1):N], type="l",
xlab=i, ylab=bquote(psi))
par(mfrow=c(1,1)) #restore default
# plot the sequence of R-hat statistics
rhat <- rep(0, N)
for (j in (b+1):N) rhat[j] <- Gelman.Rubin(psi[,1:j])
plot(rhat[(b+1):N], type="l", xlab="", ylab="R")
abline(h=1.2, lty=2)
N <- b+which.min(rhat<=1.2)
X <- Chain(group,N, 0.6)
x_tibble <- tibble(index = 1:N,X)
## ----eval = F------------------------------------------------------------
# which.min(rhat<=1.2)
## ------------------------------------------------------------------------
set.seed(1)
eps <- .Machine$double.eps^0.25#criterion to judge whether function is near to 0
k <- c(4:25,100,500,1000)#k mentioned in the question
S <- function(k,a){
return((1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1)))
}#S_k(a)-S_{k-1}(a)
Root <- function(k1){
a <- seq(0.1, sqrt(k1)-0.1,length = 3)
y <- c(S(k1,a[1]), S(k1,a[2]), S(k1,a[3]))
while(abs(y[2]) > eps) {
if (y[1] * y[2] < 0) {
a[3] <- a[2]
y[3] <- y[2]
} else {
a[1] <- a[2]
y[1] <- y[2]
}
a[2] <- (a[1] + a[3]) / 2
y[2] <- S(k1,a[2])
}
result <-list(k1,a[2],y[2])
return(result)
}
for(i in k){#print the output of each k
cat('k:',Root(i)[[1]],'root:',Root(i)[[2]],'value of function:',Root(i)[[3]],'\n')
}
## ------------------------------------------------------------------------
k <- 4
a <- seq(0.1,sqrt(k)-0.1,0.01)
y0 <- numeric(length(a))
y <- (1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1))
plot(a,y,'l')
lines(a,y0)
cat('The root of curves when k = ',k,'is',Root(k)[[2]],'\n')
k <- 10
a <- seq(0.1,sqrt(k)-0.1,0.01)
y0 <- numeric(length(a))
y <- (1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1))
plot(a,y,'l')
lines(a,y0)
cat('The root of curves when k = ',k,'is',Root(k)[[2]],'\n')
## ----message=F-----------------------------------------------------------
set.seed(1)
library(functional)
library(tidyverse)
density_Cauchy <- function(theta,eta,x){
1/(theta*pi*(1+((x-eta)/theta)^2))
}
cdf_Cauchy <- function(a,theta,eta) integrate(Curry(density_Cauchy,theta,eta),-Inf,a)$value
p_cauchy <- function(a,theta,eta) pcauchy(a,location = eta,scale = theta)
#for example we set a = 3, theta = 2, eta = 2
a <- 3
theta <- 2
eta <- 2
print(cdf_Cauchy(a,theta,eta))
print(p_cauchy(a,theta,eta))
#for example we set a = 0, theta = 5, eta = 1
a <- 0
theta <- 5
eta <- 1
print(cdf_Cauchy(a,theta,eta))
print(p_cauchy(a,theta,eta))
## ------------------------------------------------------------------------
x <- seq(-10,10,1)
theta <- 2
eta <- 3
m <- 1
y1 <- numeric(length(x))
y2 <- numeric(length(x))
for(i in x){
y1[m] <- cdf_Cauchy(i,theta,eta)
y2[m] <- p_cauchy(i,theta,eta)
m <- m+1
}
cauchy <- tibble(x=c(x,x),y=c(y1,y2),method=rep(c('integral','pcauchy'),each = 21))
ggplot(cauchy,aes(x,y))+
geom_col(aes(fill = method),position = 'dodge')
## ------------------------------------------------------------------------
N=1000
na=28
nb=24
noo=41
nab=70
p=0.3 #initial est. for p
q=0.3
r=0.3
pm=numeric(0)
qm=numeric(0)
rm=numeric(0)
lofm=numeric(0)
lof=function(p,q,r){ #log maximum likelihood values
return(log(choose(n,naa)*choose(n-naa,nbb)*choose(n-naa-nbb,noo)*choose(nao+nbo+nab,nao)*
choose(nbo+nab,nbo))+
(nao+nbo+nab)*log(2)+
(2*naa+nao+nab)*log(p)+(2*nbb+nbo+nab)*log(q)+(2*noo+nao+nbo)*log(r))
}
for (j in 1:N){
naa=round(na*p^2/(p^2+2*p*r))
nbb=round(nb*q^2/(q^2+2*q*r))
nao=na-naa
nbo=nb-nbb
n=naa+nbb+noo+nao+nbo+nab
if(abs(p-(2*naa+nao+nab)/2/n)<1e-8&&abs(q-(2*nbb+nbo+nab)/2/n)<1e-8&&abs(r-(2*noo+nbo+nao)/2/n)<1e-8&&j>5) {print(j);break}
p=(2*naa+nao+nab)/2/n #update estimation
q=(2*nbb+nbo+nab)/2/n
r=(2*noo+nbo+nao)/2/n
pm=c(pm,p)
qm=c(qm,q)
rm=c(rm,r)
lofm=c(lofm,lof(p,q,r))
}
print(c(p,q,r))
print(exp(lofm))
## ------------------------------------------------------------------------
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)
## ------------------------------------------------------------------------
set.seed(1)
n <- length(formulas)
attach(mtcars)
# loop
for (i in 1:n){
print(lm(formulas[[i]]))
}
# lapply
lapply(formulas, lm)
detach()
## ------------------------------------------------------------------------
bootstraps <- lapply(1:10, function(i) {
rows <- sample(1:nrow(mtcars), rep = TRUE)
mtcars[rows, ]
})
## ------------------------------------------------------------------------
# loop
for(i in 1:10){
print(lm(bootstraps[[i]]$mpg~bootstraps[[i]]$disp))
}
# lapply
for(i in 1:10){
bootstraps[[i]] <- subset(bootstraps[[i]],select=c(mpg,disp))
}
lapply(bootstraps,lm)
## ----eval=F--------------------------------------------------------------
# rsq <- function(mod) summary(mod)$r.squared
## ------------------------------------------------------------------------
# Exercise 3
attach(mtcars)
# loop
for (i in 1:n){
print(summary(lm(formulas[[i]]))$r.squared)
}
# lapply
lapply(formulas,function(x){
summary(lm(x))$r.squared
})
detach()
# Exercise 4
# loop
for(i in 1:10){
print(summary(lm(bootstraps[[i]]$mpg~bootstraps[[i]]$disp))$r.squared)
}
# lapply
lapply(bootstraps,function(x){
summary(lm(x$mpg~x$disp))$r.squared})
## ------------------------------------------------------------------------
trials <- replicate(
100,
t.test(rpois(10, 10), rpois(7, 10)),
simplify = FALSE
)
## ------------------------------------------------------------------------
sapply(trials,function(x){
return(x$p.value)
})
## ------------------------------------------------------------------------
data <- matrix(rnorm(20, 0, 10), nrow = 4)
x <- as.data.frame(data)
answer1 <- Map("/",x,vapply(x,mean,c(1)))
answer2 <- lapply(x,function(data){data/(mean(data))})
print(data.frame(unlist(answer1),unlist(answer2)))
## ------------------------------------------------------------------------
chisq.test_simp <- function(x, y = NULL, correct = T,p = rep(1/length(x),length(x)),rescale.p = F, simulate.p.value = F, B = 2000)
{
DNAME <- deparse(substitute(x))
if (is.data.frame(x))
x <- as.matrix(x)
if (is.matrix(x)) {
if (min(dim(x)) == 1L)
x <- as.vector(x)
}
if (!is.matrix(x) && !is.null(y)) {
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
DNAME2 <- deparse(substitute(y))
xname <- if (length(DNAME) > 1L || nchar(DNAME, "w") >
30)
""
else DNAME
yname <- if (length(DNAME2) > 1L || nchar(DNAME2, "w") >
30)
""
else DNAME2
OK <- complete.cases(x, y)
x <- factor(x[OK])
y <- factor(y[OK])
if ((nlevels(x) < 2L) || (nlevels(y) < 2L))
stop("'x' and 'y' must have at least 2 levels")
x <- table(x, y)
names(dimnames(x)) <- c(xname, yname)
DNAME <- paste(paste(DNAME, collapse = "\n"), "and",
paste(DNAME2, collapse = "\n"))
}
if (any(x < 0) || anyNA(x))
stop("all entries of 'x' must be nonnegative and finite")
if ((n <- sum(x)) == 0)
stop("at least one entry of 'x' must be positive")
if (simulate.p.value) {
setMETH <- function() METHOD <<- paste(METHOD, "with simulated p-value\n\t (based on",
B, "replicates)")
almost.1 <- 1 - 64 * .Machine$double.eps
}
if (is.matrix(x)) {
METHOD <- "Pearson's Chi-squared test"
nr <- as.integer(nrow(x))
nc <- as.integer(ncol(x))
if (is.na(nr) || is.na(nc) || is.na(nr * nc))
stop("invalid nrow(x) or ncol(x)", domain = NA)
sr <- rowSums(x)
sc <- colSums(x)
E <- outer(sr, sc, "*")/n
v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3
V <- outer(sr, sc, v, n)
dimnames(E) <- dimnames(x)
if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
setMETH()
tmp <- .Call(C_chisq_sim, sr, sc, B, E)
STATISTIC <- sum(sort((x - E)^2/E, decreasing = TRUE))
PARAMETER <- NA
PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC))/(B +
1)
}
else {
if (simulate.p.value)
warning("cannot compute simulated p-value with zero marginals")
if (correct && nrow(x) == 2L && ncol(x) == 2L) {
YATES <- min(0.5, abs(x - E))
if (YATES > 0)
METHOD <- paste(METHOD, "with Yates' continuity correction")
}
else YATES <- 0
STATISTIC <- sum((abs(x - E) - YATES)^2/E)
PARAMETER <- (nr - 1L) * (nc - 1L)
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
else {
if (length(dim(x)) > 2L)
stop("invalid 'x'")
if (length(x) == 1L)
stop("'x' must at least have 2 elements")
if (length(x) != length(p))
stop("'x' and 'p' must have the same number of elements")
if (any(p < 0))
stop("probabilities must be non-negative.")
if (abs(sum(p) - 1) > sqrt(.Machine$double.eps)) {
if (rescale.p)
p <- p/sum(p)
else stop("probabilities must sum to 1.")
}
METHOD <- "Chi-squared test for given probabilities"
E <- n * p
V <- n * p * (1 - p)
STATISTIC <- sum((x - E)^2/E)
names(E) <- names(x)
if (simulate.p.value) {
setMETH()
nx <- length(x)
sm <- matrix(sample.int(nx, B * n, TRUE, prob = p),
nrow = n)
ss <- apply(sm, 2L, function(x, E, k) {
sum((table(factor(x, levels = 1L:k)) - E)^2/E)
}, E = E, k = nx)
PARAMETER <- NA
PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B +
1)
}
else {
PARAMETER <- length(x) - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
}
}
names(STATISTIC) <- "X-squared"
names(PARAMETER) <- "df"
if (any(E < 5) && is.finite(PARAMETER))
warning("Chi-squared approximation may be incorrect")
return(PVAL)
}
x <- c(100,200,150,172,134)
y <- c(100,104,103,98)
chisq.test_simp(x)
chisq.test_simp(y)
## ------------------------------------------------------------------------
table2 <- function(x, y) {
x_val <- unique(x)
y_val <- unique(y)
mat <- matrix(0L, length(x_val), length(y_val))
for (i in seq_along(x)) {
mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <-
mat[which(x_val == x[[i]]), which(y_val == y[[i]])] + 1L
}
dimnames <- list(x_val, y_val)
names(dimnames) <- as.character(as.list(match.call())[-1])
tab <- array(mat, dim = dim(mat), dimnames = dimnames)
class(tab) <- "table"
tab
}
a <- c(1, 4, 2, 2, 1, 4)
b <- c(2, 3, 4, 2, 3, 4)
table2(a,b)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.