inst/doc/18018.R

## ------------------------------------------------------------------------
set.seed(10)
x <- rnorm(10,3,2)
print(x)

## ------------------------------------------------------------------------
a <- "Double quotes \" delimitate R¡¯s strings."
print(a)

## ------------------------------------------------------------------------
set.seed(10)
y <- rnorm(1000,2,3)
hist(y,breaks = 30)

## ------------------------------------------------------------------------
library(ggplot2)
set.seed(123)#make the results reproducible
x <- 0:4#the support of the random variable X
p <- c(0.1,0.2,0.2,0.2,0.3)#the probability of each value of X
cp <- cumsum(p)#cumulative probabilities
m <- 1000#sample size
r <- numeric(m)#a zero vector of length 1000
r <- x[findInterval(runif(m),cp)+1]
# Generate 1000 samples from a uniform distribution and find which interval of cp each one falls into. Every interval corresponds to a value of the random variable X.
table_r <- table(r)
print(table_r)

#theoretical (expected) counts
t_p <- p*1000
names(t_p) <- x
print(t_p)#show the expected counts among 1000 samples
a <- data.frame(x = rep(x,2),freq = c(as.vector(table_r),t_p),type = rep(c('random sample','theoretical sample'),each = 5))
ggplot(a,aes(x = x,y = freq,fill = type))+geom_col(position = 'dodge')
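# Cross-check (a sketch, not part of the original exercise): base R's sample()
# draws from the same discrete distribution directly.
set.seed(123)
r2 <- sample(x, size = m, replace = TRUE, prob = p)
table(r2)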

## ------------------------------------------------------------------------
set.seed(300)
library(ggplot2)
beta_arm <- function(n,a,b){#acceptance-rejection sampling from Beta(a,b), valid for a,b >= 1
  j <- 0#number of iterations
  k <- 0#number of accepted samples
  y <- numeric(n)#a zero vector of length n
  while (k < n){
    u <- runif(1)
    j <- j + 1
    x <- runif(1) #random variate from the proposal g = U(0,1)
    if (x^(a-1) * (1-x)^(b-1) > u) {
      #accept x: with c = 1/beta(a,b), f(x)/(c*g(x)) = x^(a-1)*(1-x)^(b-1) <= 1
      k <- k + 1
      y[k] <- x
    }
  }
  return(y)
}
sample_beta <- beta_arm(1000,3,2)#record the sample
head(sample_beta,20)
sample_beta_theo <- rbeta(1000,3,2)#generate the sample by using the function "rbeta"
data_beta <- data.frame(sample = c(sample_beta,sample_beta_theo),class = rep(c('empirical','theoretical'),each = 1000))#construct a data frame in order to draw a figure
ggplot(data_beta,aes(x = sample,fill = class))+geom_histogram(position="identity", alpha=0.5)
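# A quick sanity check (a sketch): compare the accept-reject sample with the
# Beta(3,2) reference distribution via a Kolmogorov-Smirnov test.
ks.test(sample_beta, "pbeta", 3, 2)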

## ------------------------------------------------------------------------
set.seed(100)
library(ggplot2)
n <- 1000
r <- 4
beta <- 2#define the parameters
lambda <- rgamma(n, r, beta)#generate the random sample from the gamma distribution with r=4, beta=2
x <- rexp(n, lambda) 
head(x,30)#list the first 30 samples
x_data <- data.frame(x1 = x)
ggplot(x_data,aes(x = x1))+geom_histogram(fill = 'skyblue',alpha =0.7,binwidth = 0.4)
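# The exponential-gamma mixture has a Pareto (Lomax) marginal with density
# f(x) = r*beta^r/(beta+x)^(r+1); a minimal sketch overlaying this closed form
# on the histogram (base graphics, to stay self-contained).
lomax_pdf <- function(t) r * beta^r / (beta + t)^(r + 1)
hist(x, breaks = 50, freq = FALSE, main = "")
curve(lomax_pdf, from = 0, to = max(x), add = TRUE, col = "red")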

## ----ex5.4---------------------------------------------------------------
set.seed(111)
library(ggplot2)
beta33_cdf <- function(n,a,b){#Monte Carlo estimate of the Beta(3,3) cdf on (a,b)
  x <- runif(n,a,b)
  cdf_est <- (sum(30*(x^2-2*x^3+x^4))/n)*(b-a)#mean of the pdf 30*x^2*(1-x)^2 times the interval length
  return(cdf_est)
}
a <- data.frame(x = seq(0.1,0.9,0.1), MonteCarlo = numeric(9),pbeta = numeric(9))#a data frame to store the cdf values
i <- 1
while(i<=9){
  a[i,2] <- beta33_cdf(10000,0,i*0.1)
  a[i,3] <- pbeta(i*0.1,3,3)
  i <- i+1
}
print(a)
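# Largest absolute discrepancy between the Monte Carlo estimates and pbeta():
max(abs(a$MonteCarlo - a$pbeta))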

b <- data.frame(x = c(a$x,a$x),cdf = c(a$MonteCarlo,a$pbeta),class = rep(c('MonteCarlo','pbeta'),each=9))
#construct b in long format so that ggplot2 can be used easily
ggplot(data = b)+
  geom_col(aes(x = x,y = cdf, fill = class),position = 'dodge')+scale_fill_brewer(palette = 'Set1')


## ----ex5.9---------------------------------------------------------------
set.seed(100)
Rayleigh <- function(n,sigma,ant = T){#inverse transform: F^{-1}(u) = sigma*sqrt(-2*log(1-u))
  m <- floor(n/2)
  u <- runif(m)
  if(ant) v <- 1-u else v <- runif(m)#antithetic variates when ant = TRUE
  u <- c(u,v)
  sample <- (-(2*sigma^2)*(log(1-u)))^(1/2)
  return(sample)
}
sample <- Rayleigh(1000,4)
head(sample,20)#for brevity, only the first 20 samples are printed
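# Sanity check (a sketch): a Rayleigh(sigma) variable has mean sigma*sqrt(pi/2),
# so with sigma = 4 the sample mean should be close to 4*sqrt(pi/2), about 5.01.
c(sample_mean = mean(sample), theoretical_mean = 4 * sqrt(pi/2))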

## ----variance reduction--------------------------------------------------
set.seed(100)
library(ggplot2)
idpt_sample <- Rayleigh(4000,2,ant=F)#use Rayleigh function to obtain 4000 independent samples
ant_sample <- Rayleigh(4000,2,ant=T)#use Rayleigh function to obtain 4000 antithetic samples
var_idpt_sample <- var((idpt_sample[1:2000]+idpt_sample[2001:4000])/2)#variance of the estimator that averages independent pairs
var_ant_sample <- var((ant_sample[1:2000]+ant_sample[2001:4000])/2)#variance of the estimator that averages antithetic pairs
cat('The variance of the independent-sample estimator is',var_idpt_sample,'.\n',
    'The variance of the antithetic-sample estimator is',var_ant_sample,'.\n',
    'The percent of variance reduction is',(var_idpt_sample-var_ant_sample)/var_idpt_sample,'.\n\n',
    'The covariance of the independent samples is',cov(idpt_sample[1:2000],idpt_sample[2001:4000]),
    ', while the covariance of the antithetic samples is',cov(ant_sample[1:2000],ant_sample[2001:4000]))#show the variances
a <- data.frame(class = c('independent sample','antithetic sample'),
                variance = c(var_idpt_sample,var_ant_sample))
ggplot(a)+geom_col(aes(x = class,y = variance),fill = 'LightSkyBlue3')+ylim(0,1)+
  coord_flip()#use figure to compare the variance reduction


## ----ex5.13--------------------------------------------------------------
set.seed(100)
n <- 10000
theta.hat <- numeric(2)
se <- numeric(2)
g <- function(x){
  x^2*exp(-(x^2)/2)/(2*pi)^(1/2)*(x>=1)
}

x <- rexp(n,1)#importance density f1: Exp(1)
fg <- g(x)/exp(-x)#g(x)/f1(x)
theta.hat[1] <- mean(fg)
se[1] <- sd(fg)

x <- Rayleigh(n,1,ant = F)#importance density f2: Rayleigh(1)
fg <- g(x)/(x*exp(-(x^2)/2))
theta.hat[2] <- mean(fg)
se[2] <- sd(fg)

integrate(g,1,Inf)

a <- data.frame(theta.hat,se)
print(a)
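# Closed form for comparison: integration by parts gives
# integral_1^Inf x^2*dnorm(x) dx = dnorm(1) + 1 - pnorm(1).
dnorm(1) + 1 - pnorm(1)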



## ----ex5.14--------------------------------------------------------------
set.seed(100)
n <- 10000
g <- function(x){
  x^2*exp(-(x^2)/2)/(2*pi)^(1/2)*(x>=1)
}

x <- Rayleigh(n,1,ant = F)#sample from the importance density f: Rayleigh(1)
fg <- g(x)/(x*exp(-(x^2)/2))#g(x)/f(x)
cat(mean(fg),'\n')#estimate the integral

integrate(g,1,Inf)

## ----ex6.9 lognormal,message=F-------------------------------------------
library(tidyverse)
set.seed(100)
m <- n <- 1000

G_lnorm <- function(n){#a function to compute G-hat
  x <- rlnorm(n)
  x <- sort(x)#generate order statistics
  sum_x <- 0
  for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
  sum_x <- sum_x/(n^2*mean(x))
  return(sum_x)
}

G1 <- numeric(m)#m estimates of G

for(j in 1:m)  G1[j] <- G_lnorm(n)

cat('The mean of G-hat is',mean(G1),', the median of G-hat is',median(G1),', and the deciles are',quantile(G1,probs = seq(0,1,0.1)))

G1 <- tibble(G1)

ggplot(G1,aes(x = G1))+geom_histogram(fill = 'royalblue',alpha = 0.7)
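# Reference value (a standard result, not part of the original code): for a
# lognormal with sdlog = sigma, the Gini index is 2*pnorm(sigma/sqrt(2)) - 1,
# so with sigma = 1 the estimates above should be close to this:
2 * pnorm(1/sqrt(2)) - 1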


## ----ex6.9 uniform,message=F---------------------------------------------
G_unif <- function(n){#a function to compute G-hat
  x <- runif(n)
  x <- sort(x)#generate order statistics
  sum_x <- 0
  for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
  sum_x <- sum_x/(n^2*mean(x))
  return(sum_x)
}

G2 <- numeric(m)#m estimates of G

for(j in 1:m)  G2[j] <- G_unif(n)

cat('The mean of G-hat is',mean(G2),', the median of G-hat is',median(G2),', and the deciles are',quantile(G2,probs = seq(0,1,0.1)))

G2 <- tibble(G2)

ggplot(G2,aes(x = G2))+geom_histogram(fill = "CadetBlue3",alpha = 0.7)
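# Reference value: the Gini index of U(0,1) is exactly 1/3, so the deviation of
# the simulated mean from it should be small.
abs(mean(G2$G2) - 1/3)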


## ----ex6.9 Bernoulli,message=F-------------------------------------------
G_bernoulli <- function(n){#a function to compute G-hat
  x <- rbinom(n, 1, 0.5)#Bernoulli(0.5) sample (base R equivalent of purrr::rbernoulli)
  x <- sort(x)#generate order statistics
  sum_x <- 0
  for(i in 1:n) sum_x <- sum_x+(2*i-n-1)*x[i]
  sum_x <- sum_x/(n^2*mean(x))
  return(sum_x)
}

G3 <- numeric(m)#m estimates of G

for(j in 1:m)  G3[j] <- G_bernoulli(n)

cat('The mean of G-hat is',mean(G3),', the median of G-hat is',median(G3),', and the deciles are',quantile(G3,probs = seq(0,1,0.1)))

G3 <- tibble(G3)

ggplot(G3,aes(x = G3))+geom_histogram(fill = 'yellow2',alpha = 0.7)
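# Reference value: for Bernoulli(p), E|X - Y| = 2p(1-p), so the Gini index is
# 1 - p; with p = 0.5 the target value is 0.5.
abs(mean(G3$G3) - 0.5)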

## ----ex6.10--------------------------------------------------------------
set.seed(100)
n <- 200#the size of x to generate G
m <- 100#the size of G to compute mean of G and sd of G
k <- 200#the size of gamma to estimate the coverage rate
G4 <- numeric(m)

for(i in 1:m)  G4[i] <- G_lnorm(n)

gamma_mean <- mean(G4)
gamma_sd <- sd(G4)

#confidence interval (a,b)
a <- gamma_mean-qt(0.975,df = m-1)*gamma_sd/sqrt(m)
b <- gamma_mean+qt(0.975,df = m-1)*gamma_sd/sqrt(m)

gamma <- numeric(k)

#generate another k = 200 gammas to check whether the confidence interval is valid
for(i in 1:k){
  G <- numeric(m)
  for(j in 1:m) G[j] <- G_lnorm(n)
  gamma[i] <- mean(G)
}

cat('The confidence interval of gamma is (',round(a,4),',',round(b,4),'), and the coverage rate is about ',round((sum(gamma>a&gamma<b)/k),3)*100,'%',sep = '')



## ----ex6.B---------------------------------------------------------------
library(MASS)
set.seed(14)
n <- 30
m <- 5000
pearson_p <- numeric(m)
kendall_p <- numeric(m)
spearman_p <- numeric(m)

sigma <- matrix(c(1,0.3,0.3,1),ncol = 2)
mean <- c(0,1)

#MC
for(i in 1:m){
mltinorm <- mvrnorm(n,mean,sigma)
x <- mltinorm [, 1]
y <- mltinorm [, 2]
#using three methods to implement tests for association
pearson <- cor.test(x,y,method = 'pearson')
kendall <- cor.test(x,y,method = 'kendall')
spearman <- cor.test(x,y,method = 'spearman')

pearson_p[i] <- pearson$p.value
kendall_p[i] <- kendall$p.value
spearman_p[i] <- spearman$p.value

}

#compute the rate at which the null hypothesis is rejected (empirical power)
cat('pearson:',sum(pearson_p<0.05)/m,
    'kendall:',sum(kendall_p<0.05)/m,
    'spearman:',sum(spearman_p<0.05)/m)

rate <- tibble(methods = c('pearson','kendall','spearman'),reject_percent = c(sum(pearson_p<0.05)/m,sum(kendall_p<0.05)/m,sum(spearman_p<0.05)/m))
rate$methods <- factor(rate$methods,levels = c('pearson','kendall','spearman'))
ggplot(rate,aes(x = methods,y = reject_percent))+
  geom_col(fill = 'Salmon')

## ----ex6.B-2-------------------------------------------------------------
set.seed(6809)
n <- 30
m <- 5000
pearson_p <- numeric(m)
kendall_p <- numeric(m)
spearman_p <- numeric(m)

sigma <- matrix(c(1,0,0,1),ncol = 2)
mean <- c(0,1)
#MC
for(i in 1:m){
  mltinorm <- mvrnorm(n, mean, sigma)
  x <- mltinorm [, 1]
  y <- mltinorm [, 2]
  
#using three methods to implement tests for association
  pearson <- cor.test(x, y, method = 'pearson')
  kendall <- cor.test(x, y, method = 'kendall')
  spearman <- cor.test(x, y, method = 'spearman')
  
  pearson_p[i] <- pearson$p.value
  kendall_p[i] <- kendall$p.value
  spearman_p[i] <- spearman$p.value
  
}

#compute the rejection rate; the null is true here, so this estimates the Type I error rate
cat('pearson:',sum(pearson_p<0.05)/m,
    'kendall:',sum(kendall_p<0.05)/m,
    'spearman:',sum(spearman_p<0.05)/m)
rate <- tibble(methods = c('pearson','kendall','spearman'),reject_percent = c(sum(pearson_p<0.05)/m,sum(kendall_p<0.05)/m,sum(spearman_p<0.05)/m))
rate$methods <- factor(rate$methods,levels = c('pearson','kendall','spearman'))
ggplot(rate,aes(x = methods,y = reject_percent))+
  geom_col(fill = 'NavajoWhite2')

## ----exercise 7.1--------------------------------------------------------
set.seed(1)
library(bootstrap)
data(law)

n <- nrow(law)
cor_law <- cor(law$LSAT, law$GPA)
cor_law_jack <- numeric(n)
for (i in 1:n)
  cor_law_jack[i] <- cor(law$LSAT[-i], law$GPA[-i])

#bias
bias_cor_law <- (n - 1) * (mean(cor_law_jack) - cor_law)
cat(
  'The bias of the correlation statistic computed by jackknife estimation is',
  bias_cor_law,
  '\n'
)
#standard error
se_cor_law <- sqrt((n - 1) * mean((cor_law_jack - mean(cor_law_jack)) ^ 2))
cat(
  'The standard error of the correlation statistic computed by jackknife estimation is',
  se_cor_law
)
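# For comparison (a sketch, not part of the original exercise): the same
# standard error estimated with a small bootstrap via the boot package.
library(boot)
cor_stat <- function(d, i) cor(d$LSAT[i], d$GPA[i])
sd(boot(law, cor_stat, R = 200)$t[, 1])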


## ----exercise 7.5,warning=F----------------------------------------------
library(boot)
data("aircondit")
mean_airc <- function(x, i) {
  a <- x[i, 1]
  return(mean(a))
}
boot.obj <- boot(aircondit, statistic = mean_airc, R = 300)
boot_ci <-
  boot.ci(boot.obj, type = c("basic", "norm", "perc", "bca"))
print(boot_ci)

## ----exercise 7.8--------------------------------------------------------
data(scor)
scor_pca <- princomp(scor, cor = F)
scor_pca_summary <- summary(scor_pca)
#the proportion of variance explained by the first principal component
theta.hat <- unname(scor_pca$sdev[1]^2/sum(scor_pca$sdev^2))#about 0.619 for these data

#jackknife
n <- nrow(scor)
theta.hat.jackknife <- numeric(n)
for (i in 1:n) {
  scor_jackknife <- scor[-i, ]
  scor_jackknife_cov <- cov(scor_jackknife)
  theta.hat.jackknife[i] <-
    eigen(scor_jackknife_cov)$values[1] / sum(eigen(scor_jackknife_cov)$values)
}

#bias
bias_theta <- (n - 1) * (mean(theta.hat.jackknife) - theta.hat)
cat(
  'The bias of the theta hat computed by jackknife estimation is',
  bias_theta,
  '\n'
)
#standard error
se_theta <-
  sqrt((n - 1) * mean((theta.hat.jackknife - mean(theta.hat.jackknife)) ^ 2))
cat(
  'The standard error of the theta hat computed by jackknife estimation is',
  se_theta
)

## ----exercise 7.11,message=F---------------------------------------------
library(DAAG)
library(tidyverse)
attach(ironslag)

L1 <-  lm(magnetic ~ chemical)
L2 <-  lm(magnetic ~ chemical + I(chemical ^ 2))
L3 <-  lm(log(magnetic) ~ chemical)
L4 <-  lm(log(magnetic) ~ log(chemical))

n <- floor(length(ironslag$magnetic) / 2)#ironslag has 53 observations, giving 26 leave-two-out pairs
e1 <- e2 <- e3 <- e4 <- numeric(n * 2)

##leave-two-out cross-validation
for (k in (1:n)*2) {
  y <- magnetic[-c(k-1,k)] 
  x <- chemical[-c(k-1,k)]
  
  J1 <- lm(y ~ x) 
  yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k-1]
  e1[k-1] <- magnetic[k-1] - yhat1#error between estimation and observation
  yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
  e1[k] <- magnetic[k] - yhat1#error between estimation and observation
  
  J2 <- lm(y ~ x + I(x^2)) 
  yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k-1] + J2$coef[3] * chemical[k-1]^2
  e2[k-1] <- magnetic[k-1] - yhat2#error between estimation and observation
  yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] + J2$coef[3] * chemical[k]^2
  e2[k] <- magnetic[k] - yhat2#error between estimation and observation

  J3 <- lm(log(y) ~ x) 
  logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k-1] 
  yhat3 <- exp(logyhat3)
  e3[k-1] <- magnetic[k-1] - yhat3#error between estimation and observation
  logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k] 
  yhat3 <- exp(logyhat3)
  e3[k] <- magnetic[k] - yhat3#error between estimation and observation
  
  J4 <- lm(log(y) ~ log(x)) 
  logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k-1]) 
  yhat4 <- exp(logyhat4)
  e4[k-1] <- magnetic[k-1] - yhat4#error between estimation and observation
  logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k]) 
  yhat4 <- exp(logyhat4)
  e4[k] <- magnetic[k] - yhat4#error between estimation and observation
}

detach()

print(c(mean(e1 ^ 2), mean(e2 ^ 2), mean(e3 ^ 2), mean(e4 ^ 2)))
#choose the model with the least average error

error <- data.frame(model = paste0('L',1:4),error = c(mean(e1 ^ 2), mean(e2 ^ 2), mean(e3 ^ 2), mean(e4 ^ 2)))
ggplot(error,aes(x=model,y = error))+
  geom_col(fill = 'wheat2')+
  xlab('errors of four models')

## ----exercise8.1, message=F----------------------------------------------
set.seed(1)
library("nortest")

attach(chickwts)
x <- sort(as.vector(weight[feed == "soybean"]))
y <- sort(as.vector(weight[feed == "linseed"]))
detach(chickwts)

z <- c(x,y)#pooled sample
R <- 999#number of replicates
K <- 1:length(z)
D <- numeric(R)#used to store statistics

CM <- function(x,y){#a function to compute the statistic of Cramer-von Mises
  ecdfx <- ecdf(x)
  ecdfy <- ecdf(y)
  l_x <- length(x)
  l_y <- length(y)
  
  sum1 <- sum((ecdfx(x)-ecdfy(x))^2)
  sum2 <- sum((ecdfx(y)-ecdfy(y))^2)
  
  w <- l_x*l_y/(l_x+l_y)^2*(sum1+sum2)
  return(w)
}

D0 <- CM(x,y)

for(i in 1:R) {
  k <- sample(K, size = length(x), replace = F)
  x1 <- z[k]
  y1 <- z[-k]
  D[i] <- CM(x1,y1)
}

p <- mean(c(D0, D) >= D0)

print(p)

hist(D, main = "", freq = FALSE, xlab = "D (p = 0.401)", breaks = "scott")
points(D0, 0, cex = 1, pch = 16)
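# For reference (a sketch): the asymptotic two-sample Kolmogorov-Smirnov test
# on the same data, which tests the same null hypothesis of equal distributions.
ks.test(x, y)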


## ----message=F-----------------------------------------------------------
library(RANN)
library(boot)
library(energy)
library(Ball)
library(ggplot2)
library(survival)

m <- 30#number of simulation replicates
k <- 3#number of nearest neighbors
p <- 2#dimension of each observation
n1 <- n2 <- 50#sample sizes
R <- 999#number of permutation replicates
n <- n1+n2
N <- c(n1,n2)

Tn <- function(z, ix, sizes,k) {
  n1 <- sizes[1]
  n2 <- sizes[2]
  n <- n1 + n2
  if(is.vector(z)) z <- data.frame(z,0)
  z <- z[ix, ]
  NN <- nn2(data=z, k=k+1)
  block1 <- NN$nn.idx[1:n1,-1] 
  block2 <- NN$nn.idx[(n1+1):n,-1] 
  i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5) 
  (i1 + i2) / (k * n)
}


eqdist.nn <- function(z,sizes,k){#NN
  boot.obj <- boot(data=z,statistic=Tn,R=R,
  sim = "permutation", sizes = sizes,k=k)
  ts <- c(boot.obj$t0,boot.obj$t)
  p.value <- mean(ts>=ts[1])
  list(statistic=ts[1],p.value=p.value)
}

p.values <- matrix(NA,m,3)#to store p values

for(i in 1:m) {
  x <- matrix(rnorm(n1 * p, sd = 1), ncol = p)#unequal variances
  y <- matrix(rnorm(n2 * p, sd = 1.4), ncol = p)
  z <- rbind(x, y)
  p.values[i, 1] <- eqdist.nn(z, N, k)$p.value#NN
  p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value#in the energy package
  p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value#"Ball Divergence" in the ball package
}

alpha <- 0.1#significance level
pow <- apply(p.values<alpha,2,mean)#proportion of p-values below alpha in each column (empirical power)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+#plot
  geom_col(fill = 'palegreen3')+
  coord_flip()

## ------------------------------------------------------------------------
for(i in 1:m) {
  x <- matrix(rnorm(n1 * p, mean = 0.4, sd = 1), ncol = p)#unequal variances and unequal expectations
  y <- matrix(rnorm(n2 * p, mean = 0, sd = 1.4), ncol = p)
  z <- rbind(x, y)
  p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
  p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
  p.values[i, 3] <- bd.test(x = x,  y = y,  R = 999,  seed = i)$p.value
}
alpha <- 0.1
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
  geom_col(fill = 'lightslategrey')+
  coord_flip()

## ------------------------------------------------------------------------
for(i in 1:m) {
  x <- matrix(rt(n1 * p,df = 1), ncol = p)#heavy-tailed t distribution with df = 1
  y <- matrix(rnorm(n2 * p,sd = sample(c(1,1.3),size = n2*p, prob = c(0.5,0.5),replace = T)), ncol = p)#scale mixture of two normals
  z <- rbind(x, y)
  p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
  p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
  p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value
}
alpha <- 0.01
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
  geom_col(fill = 'mediumpurple')+
  coord_flip()

## ------------------------------------------------------------------------
n1 <- 50
n2 <- 5
n <- n1+n2
N = c(n1,n2)
for(i in 1:m) {
  x <- matrix(rnorm(n1*p,mean = 1), ncol = p)#50 bivariate samples
  y <- matrix(rnorm(n2*p,mean = 2), ncol = p)#5 bivariate samples (unbalanced design)
  z <- rbind(x, y)
  p.values[i, 1] <- eqdist.nn(z, N, k)$p.value
  p.values[i, 2] <- eqdist.etest(z, sizes = N, R = R)$p.value
  p.values[i, 3] <- bd.test(x = x, y = y, R = 999, seed = i)$p.value
}
alpha <- 0.1
pow <- apply(p.values<alpha,2,mean)
print(pow)
power <- data.frame(methods = c('NN','energy','Ball'),pow)
ggplot(power,aes(methods,pow))+
  geom_col(fill = 'sienna2')+
  coord_flip()

## ----ex9.3,message=F-----------------------------------------------------
library(tidyverse)

set.seed(1)

m <- 20000#the number of samples
x <- numeric(m)#to store the chain
x[1] <- runif(1)#the initial state
u <- runif(m)#for the accept/reject step


standard_Cauchy <- function(x){
  return(1/(pi*(1+x^2)))
}

for (i in 2:m) {
  proposal <- x[i-1]+runif(1, min=-1, max=1)#random-walk proposal
  accept <- u[i] < standard_Cauchy(proposal)/standard_Cauchy(x[i-1])
  x[i] <- ifelse(accept, proposal, x[i-1])
}

x <- x[1001:m]
index <- 1001:m

quantile(x,probs = seq(0.1,0.9,0.1))
qcauchy(seq(0.1,0.9,0.1),loc = 0,scale=1)


x_tibble <- tibble(index,x)
ggplot(x_tibble,aes(index,x))+
  geom_point(alpha = 5/10,color = 'lightskyblue3')

k <- seq(-10,10,0.02)
hist(x, breaks="scott", main="", xlab="", freq=FALSE)
lines(k,dcauchy(k))
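# Rough acceptance rate of the chain after burn-in: the proportion of
# iterations in which the state actually moved.
mean(diff(x) != 0)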

## ----ex9.6,message=F-----------------------------------------------------
set.seed(1)
m <- 5000#length of the chain
w <- 0.25#width of the uniform support set
u <- runif(m)#for accept/reject step
v <- runif(m, -w, w)#proposal distribution
group <- c(125,18,20,34)
x <- numeric(m) #the chain

prob <- function(theta,group){#unnormalized posterior density of theta
  if(theta<0||theta>=0.8)
    return(0)
  return((1/2+theta/4)^group[1]*((1-theta)/4)^group[2]*((1-theta)/4)^group[3]*(theta/4)^group[4])
}

x[1] <- 0.4#initialize the chain at 0.4
for (i in 2:m){
  theta <- x[i-1]+v[i]
  if (u[i] <= prob(theta, group) / prob(x[i-1], group))
    x[i] <- theta
  else 
    x[i] <- x[i-1]
}

index <- 1001:m#discard the first thousand samples as burn-in

theta_hat <- mean(x[index])
print(theta_hat)

x_tibble <- tibble(index = 1001:m,x=x[index])
ggplot(x_tibble,aes(index,x))+
  geom_point(alpha = 7/10,color = 'slateblue2')


## ------------------------------------------------------------------------
set.seed(1)
group <- c(125,18,20,34)
k <- 4#number of chains to generate
N <- 5000#length of the chain
b <- 500#burn-in length

Gelman.Rubin <- function(psi) {
  psi <- as.matrix(psi)
  n <- ncol(psi)
  k <- nrow(psi)
  psi.means <- rowMeans(psi)#row means
  B <- n * var(psi.means)#between variance est.
  psi.w <- apply(psi, 1, "var")#within variances
  W <- mean(psi.w)#within est.
  v.hat <- W * (n - 1) / n + (B / n)#upper variance est.
  r.hat <- v.hat / W#G-R statistic
  return(r.hat)
}

prob <- function(theta,group){
  if(theta<0||theta>=0.9)
    return(0)
  return((1/2+theta/4)^group[1]*((1-theta)/4)^group[2]*((1-theta)/4)^group[3]*(theta/4)^group[4])
}


Chain <- function(group,N, X1) {
  x <- numeric(N)
  x[1] <- X1#initialize x[1]
  w <- 0.25
  u <- runif(N)
  v <- runif(N, -w, w)#proposal increments (length N, not the stale global m)
  
  
  for (i in 2:N) {
    theta <- x[i - 1] + v[i]
    if (u[i] <= prob(theta, group) / prob(x[i - 1], group))
    x[i] <- theta
    else
    x[i] <- x[i - 1]
  }
  return(x)
}

#generate the chains
x0 <- c(0.2, 0.4, 0.6, 0.8)
X <- matrix(0, nrow=k, ncol=N)
for (i in 1:k) X[i, ] <- Chain(group, N, x0[i])

#compute diagnostic statistics
psi <- t(apply(X, 1, cumsum))
for (i in 1:nrow(psi)) psi[i, ] <- psi[i, ] / (1:ncol(psi))
print(Gelman.Rubin(psi))

#plot psi for the four chains
par(mfrow=c(2,2))
for (i in 1:k)
  plot(psi[i, (b+1):N], type="l",
       xlab=i, ylab=bquote(psi))
par(mfrow=c(1,1)) #restore default

# plot the sequence of R-hat statistics
rhat <- rep(0, N)
for (j in (b+1):N) rhat[j] <- Gelman.Rubin(psi[,1:j])
plot(rhat[(b+1):N], type="l", xlab="", ylab="R")
abline(h=1.2, lty=2)

N <- b + min(which(rhat[(b+1):N] <= 1.2))#shortest run length (after burn-in) at which R-hat drops below 1.2
X <- Chain(group,N, 0.6)
x_tibble <- tibble(index = 1:N,X)


## ----eval = F------------------------------------------------------------
#  min(which(rhat[(b+1):N] <= 1.2))

## ------------------------------------------------------------------------
set.seed(1)
eps <- .Machine$double.eps^0.25#tolerance for treating the function value as zero
k <- c(4:25,100,500,1000)#k mentioned in the question

S <- function(k,a){
  return((1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1)))
}#S_k(a)-S_{k-1}(a)


Root <- function(k1){#bisection search for the root of S(k1, a) on (0.1, sqrt(k1)-0.1)
  a <- seq(0.1, sqrt(k1)-0.1,length = 3)
  y <- c(S(k1,a[1]), S(k1,a[2]), S(k1,a[3]))
  while(abs(y[2]) > eps) {
    if (y[1] * y[2] < 0) {
      a[3] <- a[2]
      y[3] <- y[2]
    } else {
      a[1] <- a[2]
      y[1] <- y[2]
    }
    a[2] <- (a[1] + a[3]) / 2
    y[2] <- S(k1,a[2])
  }
  result <- list(k1,a[2],y[2])
  return(result)
}

for(i in k){#print the output for each k (calling Root once per k)
  result <- Root(i)
  cat('k:',result[[1]],'root:',result[[2]],'value of function:',result[[3]],'\n')
}
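# Cross-check with base R's uniroot() for a single k (say k = 10); the
# bracketing interval mirrors the bisection above and assumes a sign change.
uniroot(function(a) S(10, a), interval = c(0.1, sqrt(10) - 0.1))$root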

## ------------------------------------------------------------------------
k <- 4
a <- seq(0.1,sqrt(k)-0.1,0.01)
y0 <- numeric(length(a))
y <- (1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1))
plot(a, y, type = 'l')
lines(a, y0)#reference line at y = 0
cat('The root of curves when k = ',k,'is',Root(k)[[2]],'\n')

k <- 10
a <- seq(0.1,sqrt(k)-0.1,0.01)
y0 <- numeric(length(a))
y <- (1-pt(sqrt((a^2*k)/(k+1-a^2)),df = k))-(1-pt(sqrt((a^2*(k-1))/(k-a^2)),df=k-1))
plot(a, y, type = 'l')
lines(a, y0)#reference line at y = 0
cat('The root of curves when k = ',k,'is',Root(k)[[2]],'\n')



## ----message=F-----------------------------------------------------------
set.seed(1)
library(functional)
library(tidyverse)

density_Cauchy <- function(theta,eta,x){
  1/(theta*pi*(1+((x-eta)/theta)^2))
}
cdf_Cauchy <- function(a,theta,eta) integrate(Curry(density_Cauchy,theta,eta),-Inf,a)$value
p_cauchy <- function(a,theta,eta) pcauchy(a,location = eta,scale = theta)

#for example we set a = 3, theta = 2, eta = 2
a <- 3
theta <- 2
eta <- 2
print(cdf_Cauchy(a,theta,eta))
print(p_cauchy(a,theta,eta))

#for example we set a = 0, theta = 5, eta = 1
a <- 0
theta <- 5
eta <- 1
print(cdf_Cauchy(a,theta,eta))
print(p_cauchy(a,theta,eta))




## ------------------------------------------------------------------------
x <- seq(-10,10,1)
theta <- 2
eta <- 3
m <- 1
y1 <- numeric(length(x))
y2 <- numeric(length(x))


for(i in x){
  y1[m] <- cdf_Cauchy(i,theta,eta)
  y2[m] <- p_cauchy(i,theta,eta)
  m <- m+1
}

cauchy <- tibble(x=c(x,x),y=c(y1,y2),method=rep(c('integral','pcauchy'),each = 21))
ggplot(cauchy,aes(x,y))+
  geom_col(aes(fill = method),position = 'dodge')


## ------------------------------------------------------------------------
N=1000
na=28
nb=24
noo=41
nab=70
p=0.3   #initial est. for p
q=0.3
r=0.3
pm=numeric(0)
qm=numeric(0)
rm=numeric(0)
lofm=numeric(0)
lof=function(p,q,r){   #log-likelihood of the complete data
  return(log(choose(n,naa)*choose(n-naa,nbb)*choose(n-naa-nbb,noo)*choose(nao+nbo+nab,nao)*
            choose(nbo+nab,nbo))+
           (nao+nbo+nab)*log(2)+
           (2*naa+nao+nab)*log(p)+(2*nbb+nbo+nab)*log(q)+(2*noo+nao+nbo)*log(r))
}

for (j in 1:N){
  naa=round(na*p^2/(p^2+2*p*r))   #E-step: expected genotype counts given current (p,q,r)
  nbb=round(nb*q^2/(q^2+2*q*r))
  nao=na-naa
  nbo=nb-nbb
  n=naa+nbb+noo+nao+nbo+nab
  if(abs(p-(2*naa+nao+nab)/2/n)<1e-8&&abs(q-(2*nbb+nbo+nab)/2/n)<1e-8&&abs(r-(2*noo+nbo+nao)/2/n)<1e-8&&j>5) {print(j);break}#stop once the estimates have converged
  p=(2*naa+nao+nab)/2/n  #M-step: update the estimates
  q=(2*nbb+nbo+nab)/2/n
  r=(2*noo+nbo+nao)/2/n
  pm=c(pm,p)
  qm=c(qm,q)
  rm=c(rm,r)
  lofm=c(lofm,lof(p,q,r))
}

print(c(p,q,r))
print(exp(lofm))#likelihood (not log-likelihood) at each EM step
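# A quick look at the trajectory of the log-likelihood across EM iterations
# (it should be essentially non-decreasing, up to the rounding of counts):
plot(lofm, type = 'l', xlab = 'EM iteration', ylab = 'log-likelihood')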

## ------------------------------------------------------------------------
formulas <- list( 
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)

## ------------------------------------------------------------------------
set.seed(1)
n <- length(formulas)

attach(mtcars)

# loop
for (i in 1:n){
   print(lm(formulas[[i]]))
}

# lapply
lapply(formulas, lm)

detach()

## ------------------------------------------------------------------------

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), replace = TRUE)
  mtcars[rows, ]
})

## ------------------------------------------------------------------------
# loop
for(i in 1:10){
  print(lm(bootstraps[[i]]$mpg~bootstraps[[i]]$disp))
}

# lapply: first keep only the two needed columns so that lm() can take each data frame directly
for(i in 1:10){
  bootstraps[[i]] <- subset(bootstraps[[i]],select=c(mpg,disp))
}

lapply(bootstraps,lm)

## ----eval=F--------------------------------------------------------------
#  rsq <- function(mod) summary(mod)$r.squared

## ------------------------------------------------------------------------
# Exercise 3
attach(mtcars)

# loop
for (i in 1:n){
   print(summary(lm(formulas[[i]]))$r.squared)
}

# lapply
lapply(formulas,function(x){
  summary(lm(x))$r.squared
})

detach()

# Exercise 4
# loop
for(i in 1:10){
  print(summary(lm(bootstraps[[i]]$mpg~bootstraps[[i]]$disp))$r.squared)
}

# lapply
lapply(bootstraps,function(x){
      summary(lm(x$mpg~x$disp))$r.squared})


## ------------------------------------------------------------------------
trials <- replicate(
  100,
  t.test(rpois(10, 10), rpois(7, 10)),
  simplify = FALSE
)

## ------------------------------------------------------------------------
sapply(trials,function(x){
  return(x$p.value)
})
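# The same extraction without an anonymous function, using "[[" directly:
sapply(trials, "[[", "p.value")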

## ------------------------------------------------------------------------
data <- matrix(rnorm(20, 0, 10), nrow = 4)
x <- as.data.frame(data)
answer1 <- Map("/",x,vapply(x,mean,numeric(1)))
answer2 <- lapply(x,function(data){data/(mean(data))})
print(data.frame(unlist(answer1),unlist(answer2)))

## ------------------------------------------------------------------------
chisq.test_simp <- function(x, y = NULL, correct = T,p = rep(1/length(x),length(x)),rescale.p = F, simulate.p.value = F, B = 2000)
  {
    DNAME <- deparse(substitute(x))
    if (is.data.frame(x)) 
        x <- as.matrix(x)
    if (is.matrix(x)) {
        if (min(dim(x)) == 1L) 
            x <- as.vector(x)
    }
    if (!is.matrix(x) && !is.null(y)) {
        if (length(x) != length(y)) 
            stop("'x' and 'y' must have the same length")
        DNAME2 <- deparse(substitute(y))
        xname <- if (length(DNAME) > 1L || nchar(DNAME, "w") > 
            30) 
            ""
        else DNAME
        yname <- if (length(DNAME2) > 1L || nchar(DNAME2, "w") > 
            30) 
            ""
        else DNAME2
        OK <- complete.cases(x, y)
        x <- factor(x[OK])
        y <- factor(y[OK])
        if ((nlevels(x) < 2L) || (nlevels(y) < 2L)) 
            stop("'x' and 'y' must have at least 2 levels")
        x <- table(x, y)
        names(dimnames(x)) <- c(xname, yname)
        DNAME <- paste(paste(DNAME, collapse = "\n"), "and", 
            paste(DNAME2, collapse = "\n"))
    }
    if (any(x < 0) || anyNA(x)) 
        stop("all entries of 'x' must be nonnegative and finite")
    if ((n <- sum(x)) == 0) 
        stop("at least one entry of 'x' must be positive")
    if (simulate.p.value) {
        setMETH <- function() METHOD <<- paste(METHOD, "with simulated p-value\n\t (based on", 
            B, "replicates)")
        almost.1 <- 1 - 64 * .Machine$double.eps
    }
    if (is.matrix(x)) {
        METHOD <- "Pearson's Chi-squared test"
        nr <- as.integer(nrow(x))
        nc <- as.integer(ncol(x))
        if (is.na(nr) || is.na(nc) || is.na(nr * nc)) 
            stop("invalid nrow(x) or ncol(x)", domain = NA)
        sr <- rowSums(x)
        sc <- colSums(x)
        E <- outer(sr, sc, "*")/n
        v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3
        V <- outer(sr, sc, v, n)
        dimnames(E) <- dimnames(x)
        if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
            setMETH()
            tmp <- .Call(stats:::C_chisq_sim, sr, sc, B, E)#internal C routine from the stats namespace
            STATISTIC <- sum(sort((x - E)^2/E, decreasing = TRUE))
            PARAMETER <- NA
            PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC))/(B + 
                1)
        }
        else {
            if (simulate.p.value) 
                warning("cannot compute simulated p-value with zero marginals")
            if (correct && nrow(x) == 2L && ncol(x) == 2L) {
                YATES <- min(0.5, abs(x - E))
                if (YATES > 0) 
                  METHOD <- paste(METHOD, "with Yates' continuity correction")
            }
            else YATES <- 0
            STATISTIC <- sum((abs(x - E) - YATES)^2/E)
            PARAMETER <- (nr - 1L) * (nc - 1L)
            PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
        }
    }
    else {
        if (length(dim(x)) > 2L) 
            stop("invalid 'x'")
        if (length(x) == 1L) 
            stop("'x' must at least have 2 elements")
        if (length(x) != length(p)) 
            stop("'x' and 'p' must have the same number of elements")
        if (any(p < 0)) 
            stop("probabilities must be non-negative.")
        if (abs(sum(p) - 1) > sqrt(.Machine$double.eps)) {
            if (rescale.p) 
                p <- p/sum(p)
            else stop("probabilities must sum to 1.")
        }
        METHOD <- "Chi-squared test for given probabilities"
        E <- n * p
        V <- n * p * (1 - p)
        STATISTIC <- sum((x - E)^2/E)
        names(E) <- names(x)
        if (simulate.p.value) {
            setMETH()
            nx <- length(x)
            sm <- matrix(sample.int(nx, B * n, TRUE, prob = p), 
                nrow = n)
            ss <- apply(sm, 2L, function(x, E, k) {
                sum((table(factor(x, levels = 1L:k)) - E)^2/E)
            }, E = E, k = nx)
            PARAMETER <- NA
            PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B + 
                1)
        }
        else {
            PARAMETER <- length(x) - 1
            PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
        }
    }
    names(STATISTIC) <- "X-squared"
    names(PARAMETER) <- "df"
    if (any(E < 5) && is.finite(PARAMETER)) 
        warning("Chi-squared approximation may be incorrect")
    return(PVAL)
}

x <- c(100,200,150,172,134)
y <- c(100,104,103,98)
chisq.test_simp(x)
chisq.test_simp(y)
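# Rough timing comparison with the full stats::chisq.test() (a sketch; exact
# timings vary by machine):
system.time(for (i in 1:1000) chisq.test_simp(x))
system.time(for (i in 1:1000) chisq.test(x))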

## ------------------------------------------------------------------------
table2 <- function(x, y) {
  x_val <- unique(x)
  y_val <- unique(y)
  mat <- matrix(0L, length(x_val), length(y_val))
  for (i in seq_along(x)) {
    mat[which(x_val == x[[i]]), which(y_val == y[[i]])] <-
      mat[which(x_val == x[[i]]),  which(y_val == y[[i]])] + 1L
  }
  dimnames <- list(x_val, y_val)
  names(dimnames) <- as.character(as.list(match.call())[-1])
  tab <- array(mat, dim = dim(mat), dimnames = dimnames)
  class(tab) <- "table"
  tab
}

a <- c(1, 4, 2, 2, 1, 4)
b <- c(2, 3, 4, 2, 3, 4)
table2(a,b)
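# Compare with base::table on the same inputs (note that table2() keeps the
# levels in order of first appearance, while table() sorts them):
table(a, b)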