doc/StatCompAnswer.R

## ----eval=FALSE----------------------------------------------------------
#  #The first example:
#  
#  x <- rnorm(10)
#  y <- rnorm(10)
#  plot(x,y)
#  
#  
#  #The second example:
#  
#  data("InsectSprays")
#  aov.spray <- aov(sqrt(count) ~ spray,data=InsectSprays)
#  summary(aov.spray)
#  
#  
#  #The third example:
#  
#  m1 <- matrix(1,2,2)
#  m2 <- matrix(2,2,2)
#  rbind(m1,m2)
#  cbind(m1,m2)
#  

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  
#  x <- c(0,1,2,3,4)
#  p <- c(0.1,0.2,0.2,0.2,0.3)
#  cp <- cumsum(p)
#  m <- 1e4
#  r <- numeric(m)
#  r <- x[findInterval(runif(m),cp)+1]
#  ct <- table(r)/m
#  x <- sample(c(0,1,2,3,4), m, replace = TRUE, prob = p)
#  y <- table(x)/m
#  rbind(prob=p,prob1=ct,prob2=y)
#  
#  #The answer to the second question:
#  
#  #acceptance-rejection sampler for Beta(a, b); renamed beta.ar so it does not
#  #mask the base function beta()
#  beta.ar <- function(n, a, b) {
#    j <- k <- 0
#    y <- numeric(n)
#    while (k < n) {
#      u <- runif(1)
#      j <- j + 1
#      x <- runif(1)  #proposal from Uniform(0,1)
#      if (x^(a-1) * (1-x)^(b-1) > u) {
#        #accept x
#        k <- k + 1
#        y[k] <- x
#      }
#    }
#    return(y)
#  }
#  
#  r <- beta.ar(1000, 3, 2)
#  hist(r,prob=TRUE,main="the histogram of Beta(3,2) and its density",col="red")
#  t<-seq(0,1,0.001)
#  s<-dbeta(t,3,2)
#  lines(t,s,col="blue",lwd=3)
#  
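#  #a quick sanity check (a sketch, not part of the original answer): with a
#  #Uniform(0,1) proposal the acceptance probability is E[x^2*(1-x)] = 1/12,
#  #so the sampler needs about 12*n proposals on average
#  xs <- runif(1e5)
#  mean(xs^2 * (1 - xs))  #empirical acceptance rate, approximately 1/12
#  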
#  #The answer to the third question:
#  
#  n <- 1e3; r <- 4; beta <- 2
#  lambda <- rgamma(n, r, beta)
#  y <- rexp(n, lambda)
#  y
#  
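#  #a sketch (not part of the original answer): the Exponential-Gamma mixture
#  #has the Pareto (Lomax) marginal density f(y) = r*beta^r/(beta + y)^(r+1),
#  #so the sample can be checked against that curve
#  hist(y, prob = TRUE, breaks = 50, main = "Exp-Gamma mixture vs Pareto density")
#  t <- seq(0, max(y), 0.01)
#  lines(t, r * beta^r/(beta + t)^(r + 1), col = "blue", lwd = 2)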

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  #Monte Carlo (sample-mean) estimate of the Beta(3,3) cdf at x
#  mtbeta <- function(x){
#    m <- 1e5
#    t <- runif(m, 0, x)
#    mean(30 * t^2 * (1-t)^2) * x
#  }
#  mbeta <- Vectorize(mtbeta)
#  s <- seq(0.1, 0.9, 0.1)
#  phat <- mbeta(s)
#  print(round(rbind(x = s, p = pbeta(s, 3, 3), phat = phat, error = phat - pbeta(s, 3, 3)), 5))
#  
#  #The answer to the second question:
#  #Monte Carlo estimate of the Rayleigh(sigma = 1) cdf, optionally using
#  #antithetic variables
#  mtrayl <- function(x, R = 10000, antithetic = TRUE) {
#    u <- runif(R/2)
#    if (!antithetic) v <- runif(R/2) else v <- 1 - u
#    u <- c(u, v)
#    cdf <- numeric(length(x))
#    for (i in 1:length(x)) {
#      g <- x[i]^2 * u * exp(-(u * x[i])^2 / 2)
#      cdf[i] <- mean(g)
#    }
#    cdf
#  }
#  x <- seq(0, 2.5, 0.5)
#  set.seed(123)
#  mtr1 <- mtrayl(x, antithetic = FALSE)
#  set.seed(123)
#  mtr2 <- mtrayl(x, antithetic = TRUE)
#  print(round(rbind(x, mtr1, mtr2), 5))
#  m <- 1000
#  mtr3 <- mtr4 <- numeric(m)
#  x <- 1
#  for (i in 1:m) {
#    mtr3[i] <- mtrayl(x, antithetic = FALSE)
#    mtr4[i] <- mtrayl(x)
#  }
#  print(var(mtr3))
#  print(var(mtr4))
#  print((var(mtr3) - var(mtr4))/var(mtr3))
#  
#  #The answer to the third question:
#  m <- 10000
#  se <- numeric(2)
#  g <- function(x) {
#    exp(-x^2/2) * x^2/sqrt(2*pi) * (x > 1)
#  }
#  
#  #importance function f1: Weibull(shape = 2, scale = sqrt(2)), with density
#  #f1(x) = x*exp(-x^2/2)
#  x <- rweibull(m, 2, sqrt(2))
#  fg1 <- g(x)/(x * exp(-x^2/2))
#  
#  se[1] <- sd(fg1)
#  
#  #importance function f2: Exp(1)
#  x <- rexp(m, 1)
#  fg2 <- g(x)/exp(-x)
#  
#  se[2] <- sd(fg2)
#  se
#  
#  #The answer to the fourth question:
#  m <- 10000
#  g <- function(x) {
#    exp(-x^2/2) * x^2/sqrt(2*pi) * (x > 1)
#  }
#  #importance sampling with the Weibull importance function f1 chosen above
#  x <- rweibull(m, 2, sqrt(2))
#  fg1 <- g(x)/(x * exp(-x^2/2))
#  theta1 <- mean(fg1)
#  theta1
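#  
#  #as a cross-check (a sketch, not part of the original answer), the exact
#  #value of the integral is available by numerical integration:
#  integrate(g, 1, Inf)$value  #should be close to theta1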

## ----eval=FALSE----------------------------------------------------------
#  
#  #The answer to the first question:
#  m <- 1000
#  n <- 20
#  g <- numeric(m)
#  medians  <- means <- numeric(3)
#  y <- gini1 <- gini2 <- gini3 <- numeric(m)
#  
#  #Gini ratios for standard lognormal samples
#  for (i in 1:m) {
#    x <- sort(rlnorm(n))
#    xmean <- mean(x)
#    for (j in 1:n) {
#      y[j] <- (2*j - n - 1) * x[j]
#    }
#    gini1[i] <- 1/n^2/xmean * sum(y[1:n])
#  }
#  
#  #Gini ratios for Uniform(0,1) samples
#  for (i in 1:m) {
#    x <- sort(runif(n))
#    xmean <- mean(x)
#    for (j in 1:n) {
#      y[j] <- (2*j - n - 1) * x[j]
#    }
#    gini2[i] <- 1/n^2/xmean * sum(y[1:n])
#  }
#  
#  #Gini ratios for Bernoulli(0.9) samples
#  for (i in 1:m) {
#    x <- sort(rbinom(n, size = 1, prob = 0.9))
#    xmean <- mean(x)
#    for (j in 1:n) {
#      y[j] <- (2*j - n - 1) * x[j]
#    }
#    gini3[i] <- 1/n^2/xmean * sum(y[1:n])
#  }
#  
#  par(mfrow=c(3,1))
#  par(pin=c(2,1))
#  hist(gini1,prob = TRUE)
#  lines(density(gini1),col = "red",lwd = 2)
#  hist(gini2,prob = TRUE)
#  lines(density(gini2),col = "blue",lwd = 2)
#  hist(gini3,prob = TRUE)
#  lines(density(gini3),col = "green",lwd = 2)
#  
#  medians[1] <- median(gini1)
#  medians[2] <- median(gini2)
#  medians[3] <- median(gini3)
#  medians
#  
#  quantiles1 <- as.vector(quantile(gini1,seq(0.1,0.9,0.1)))
#  quantiles1
#  
#  quantiles2 <- as.vector(quantile(gini2,seq(0.1,0.9,0.1)))
#  quantiles2
#  
#  quantiles3 <- as.vector(quantile(gini3,seq(0.1,0.9,0.1)))
#  quantiles3
#  
#  means[1] <- mean(gini1)
#  means[2] <- mean(gini2)
#  means[3] <- mean(gini3)
#  means
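#  
#  #for reference (a sketch using standard closed forms, not part of the
#  #original answer): the Gini ratio is 2*pnorm(sigma/sqrt(2)) - 1 for a
#  #lognormal(0, sigma^2), 1/3 for Uniform(0,1), and 1 - p for Bernoulli(p)
#  c(2 * pnorm(1/sqrt(2)) - 1, 1/3, 1 - 0.9)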
#  
#  
#  #The answer to the second question:
#  m <- 1000
#  n <- 20
#  gini <- numeric(m)
#  
#  #generate a series of Gini ratios from the lognormal distribution
#  for (i in 1:m) {
#    x <- sort(rlnorm(n))
#    xmean <- mean(x)
#    for (j in 1:n) {
#      y[j] <- (2*j - n - 1) * x[j]
#    }
#    gini[i] <- 1/n^2/xmean * sum(y[1:n])
#  }
#  
#  #lower confidence limit
#  LCI <- mean(gini) - sd(gini) * qt(0.975, m-1)
#  #upper confidence limit
#  UCI <- mean(gini) + sd(gini) * qt(0.975, m-1)
#  #the confidence interval
#  CI <- c(LCI, UCI)
#  print(CI)
#  #estimate the coverage rate
#  covrate <- mean(gini > CI[1] & gini < CI[2])
#  print(covrate)
#  
#  
#  #The answer to the third question:
#  #we need to load the MASS package for mvrnorm
#  library(MASS)
#  mean <- c(0, 0)
#  sigma <- matrix( c(25,5,
#                      5, 25),nrow=2, ncol=2)
#  m <- 1000
#  
#  #estimate the power of the Pearson correlation test (method = "pearson")
#  pearvalues <- replicate(m, expr = {
#      mydata1 <- mvrnorm(50, mean, sigma)
#      x <- mydata1[,1]
#      y <- mydata1[,2]
#      peartest <- cor.test(x,y,alternative = "two.sided", method = "pearson")
#      peartest$p.value
#  } )
#  power1 <- mean(pearvalues <= 0.05)
#  power1
#  
#  #estimate the power of the Spearman correlation test (method = "spearman")
#  spearvalues <- replicate(m, expr = {
#      mydata2 <- mvrnorm(50, mean, sigma)
#      x <- mydata2[,1]
#      y <- mydata2[,2]
#      speartest <- cor.test(x,y,alternative = "two.sided", method = "spearman")
#      speartest$p.value
#  } )
#  power2 <- mean(spearvalues <= 0.05)
#  power2
#  
#  #estimate the power of the Kendall correlation test (method = "kendall")
#  kenvalues <- replicate(m, expr = {
#      mydata3 <- mvrnorm(50, mean, sigma)
#      x <- mydata3[,1]
#      y <- mydata3[,2]
#      kentest <- cor.test(x,y,alternative = "two.sided", method = "kendall")
#      kentest$p.value
#  } )
#  power3 <- mean(kenvalues <= 0.05)
#  power3
#  
#  R <- 1000
#  m <- 1000
#  
#  #estimate the power of the Pearson correlation test (method = "pearson")
#  pearvalues <- replicate(m, expr = {
#      u <- runif(R/2,0,10)
#      v <- sin(u)
#      peartest <- cor.test(u,v,alternative = "two.sided", method = "pearson")
#      peartest$p.value
#  } )
#  power1 <- mean(pearvalues <= 0.05)
#  power1
#  
#  #estimate the power of the Spearman correlation test (method = "spearman")
#  spearvalues <- replicate(m, expr = {
#      u <- runif(R/2,0,10)
#      v <- sin(u)
#      speartest <- cor.test(u,v,alternative = "two.sided", method = "spearman")
#      speartest$p.value
#  } )
#  power2 <- mean(spearvalues <= 0.05)
#  power2
#  
#  #estimate the power of the Kendall correlation test (method = "kendall")
#  kenvalues <- replicate(m, expr = {
#      u <- runif(R/2,0,10)
#      v <- sin(u)
#      kentest <- cor.test(u,v,alternative = "two.sided", method = "kendall")
#      kentest$p.value
#  } )
#  power3 <- mean(kenvalues <= 0.05)
#  power3
#  

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  LSAT <- c(576, 635, 558, 578, 666, 580, 555, 661, 651, 605, 653, 575, 545, 572, 594)
#  GPA <- c(339, 330, 281, 303, 344, 307, 300, 343, 336, 313, 312, 274, 276, 288, 296)
#  n <- length(LSAT)
#  theta.hat <- cor(LSAT, GPA)  #the correlation between LSAT and GPA in the data above
#  theta.jack <- numeric(n)
#  library('bootstrap')
#  #compute the jackknife replicates, leave-one-out estimates
#  for (i in 1:n)
#    theta.jack[i] <- cor(LSAT[-i], GPA[-i])
#  bias <- (n - 1) * (mean(theta.jack) - theta.hat)
#  #jackknife estimate of bias
#  print(bias)
#  se <- sqrt((n-1)* mean((theta.jack - mean(theta.jack))^2))
#  print(se)
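#  
#  #a hedged cross-check (not part of the original answer): the bootstrap
#  #standard error of the correlation, for comparison with the jackknife se;
#  #cor.stat is a small helper introduced only for this sketch
#  library(boot)
#  cor.stat <- function(d, ind) cor(d[ind, 1], d[ind, 2])
#  boot(cbind(LSAT, GPA), statistic = cor.stat, R = 2000)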
#  
#  
#  #The answer to the second question:
#  set.seed(1)
#  air <- c(3,5,7,18,43,85,91,98,100,130,230,487)
#  library(boot)
#  lambda.boot <- function(data, ind) {
#    #score equation of the exponential MLE: n/lambda - sum(x) = 0
#    f <- function(lambda) n/lambda - s
#    n <- length(data[ind])
#    s <- sum(data[ind])
#    #the function uniroot finds the root; the lower endpoint avoids lambda = 0
#    root <- uniroot(f, c(1e-6, 1))$root
#    1/root
#  }
#  boot.obj <- boot(data = air, statistic = lambda.boot, R = 3000)
#  print(boot.obj)
#  print(boot.ci(boot.obj,type = c("basic", "norm", "perc","bca")))
#  
#  
#  #The answer to the third question:
#  library(bootstrap)
#  n <- nrow(scor)
#  #get the eigenvalues
#  eigenvalues <- eigen(cov(scor))$values
#  #the estimate computed from the original observed sample
#  theta.hat <- max(eigenvalues)/sum(eigenvalues)
#  theta.jack <- numeric(n)
#  for (i in 1:n) {
#    ev <- eigen(cov(scor[-i, ]))$values
#    theta.jack[i] <- max(ev)/sum(ev)
#  }
#  bias <- (n - 1) * (mean(theta.jack) - theta.hat)
#  print(bias) #jackknife estimate of bias
#  se <- sqrt((n-1)* mean((theta.jack - mean(theta.jack))^2))
#  print(se)
#  
#  #The answer to the fourth question:
#  library(DAAG)
#  attach(ironslag)
#  #the number of observations
#  n <- length(chemical)
#  #all pairs of indices, for leave-two-out cross validation
#  comb <- t(combn(1:n, 2))
#  N <- nrow(comb)
#  e1 <- e2 <- e3 <- e4 <- e5 <- e6 <- e7 <- e8 <- numeric(N)
#  for (k in 1:N) {
#    #comb is a matrix with two columns
#    k1 <- comb[k, 1]
#    k2 <- comb[k, 2]
#    #the training set leaves out the k1-th and k2-th observations
#    x <- chemical[-c(k1, k2)]
#    y <- magnetic[-c(k1, k2)]
#    #use the training set to get the models
#    J1 <- lm(y ~ x)
#    #use the test set to get the prediction error
#    yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k1]
#    yhat2 <- J1$coef[1] + J1$coef[2] * chemical[k2]
#    e1[k] <- magnetic[k1] - yhat1
#    e2[k] <- magnetic[k2] - yhat2
#    #the following procedures are the same as the above
#    J2 <- lm(y ~ x + I(x^2))
#    yhat3 <- J2$coef[1] + J2$coef[2] * chemical[k1] +
#      J2$coef[3] * chemical[k1]^2
#    yhat4 <- J2$coef[1] + J2$coef[2] * chemical[k2] +
#      J2$coef[3] * chemical[k2]^2
#    e3[k] <- magnetic[k1] - yhat3
#    e4[k] <- magnetic[k2] - yhat4
#  
#    J3 <- lm(log(y) ~ x)
#    logyhat5 <- J3$coef[1] + J3$coef[2] * chemical[k1]
#    logyhat6 <- J3$coef[1] + J3$coef[2] * chemical[k2]
#    yhat5 <- exp(logyhat5)
#    yhat6 <- exp(logyhat6)
#    e5[k] <- magnetic[k1] - yhat5
#    e6[k] <- magnetic[k2] - yhat6
#  
#    J4 <- lm(log(y) ~ log(x))
#    logyhat7 <- J4$coef[1] + J4$coef[2] * log(chemical[k1])
#    logyhat8 <- J4$coef[1] + J4$coef[2] * log(chemical[k2])
#    yhat7 <- exp(logyhat7)
#    yhat8 <- exp(logyhat8)
#    e7[k] <- magnetic[k1] - yhat7
#    e8[k] <- magnetic[k2] - yhat8
#  
#  }
#  
#  # get the prediction error by leave-two-out cross validation.
#  c(mean(e1^2+e2^2), mean(e3^2+e4^2), mean(e5^2+e6^2), mean(e7^2+e8^2))
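#  
#  #if, as in the textbook's leave-one-out analysis, the quadratic model gives
#  #the smallest prediction error, it would be fitted on the full data
#  #(a sketch, not part of the original answer):
#  L2 <- lm(magnetic ~ chemical + I(chemical^2))
#  summary(L2)$coef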
#  

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  set.seed(1)
#  x <- c(158, 171 ,193 ,199 ,230 ,243 ,248 ,248 ,250 ,267 ,271 ,316 ,327 ,329)
#  y <- c(141 ,148 ,169 ,181 ,203 ,213 ,229 ,244 ,257 ,260 ,271 ,309)
#  #the function cvm.test computes the two-sample Cramer-von Mises statistic
#  cvm.test <- function(x,y){
#    #the empirical distribution function of the x,y
#    F <- ecdf(x)
#    G <- ecdf(y)
#    n <- length(x)
#    m <- length(y)
#    s <- numeric(n)
#    t <- numeric(m)
#    for (i  in 1:n) {
#      s[i] <- (F(x[i])-G(x[i]))^2
#    }
#    s <- sum(s)
#    for (j  in 1:m) {
#      t[j] <- (F(y[j])-G(y[j]))^2
#    }
#    t <- sum(t)
#    #return the Cramer-von Mises statistic
#    return (m*n*(s+t)/(m+n)^2)
#  }
#  #number of replicates
#  R <- 999
#  #pooled sample
#  z <- c(x, y)
#  K <- 1:26  #indices into the pooled sample (n + m = 26)
#  #storage for replicates
#  reps <- numeric(R)
#  t0 <- cvm.test(x, y)
#  for (i in 1:R) {
#    #generate indices k for the first sample
#    k <- sample(K, size = 14, replace = FALSE)
#    x1 <- z[k]
#    y1 <- z[-k] #complement of x1
#    reps[i] <- cvm.test(x1, y1)
#  }
#  p <- mean(c(t0, reps) >= t0)
#  p
#  hist(reps, main = "", freq = FALSE, xlab = paste("T (p =", round(p, 3), ")"),
#  breaks = "scott")
#  points(t0, 0, cex = 1, pch = 16)
#  
#  
#  #The answer to the second question:
#  library(RANN)
#  library(boot)
#  library(energy)
#  library(Ball)
#  m <- 100; k<-3; p<-2; mu <- 0.2; set.seed(12345)
#  n1 <- n2 <- 20; R<-50; n <- n1+n2; N = c(n1,n2)
#  Tn <- function(z, ix, sizes,k) {
#    n1 <- sizes[1]; n2 <- sizes[2]; n <- n1 + n2
#    if(is.vector(z)) z <- data.frame(z,0);
#    z <- z[ix, ];
#    NN <- nn2(data=z, k=k+1)
#    block1 <- NN$nn.idx[1:n1,-1]
#    block2 <- NN$nn.idx[(n1+1):n,-1]
#    i1 <- sum(block1 < n1 + .5); i2 <- sum(block2 > n1+.5)
#    (i1 + i2) / (k * n)
#  }
#  #the NN test: returns the statistic and its permutation p-value
#  eqdist.nn <- function(z,sizes,k){
#    boot.obj <- boot(data=z,statistic=Tn,R=R,
#                     sim = "permutation", sizes = sizes,k=k)
#    ts <- c(boot.obj$t0,boot.obj$t)
#    p.value <- mean(ts>=ts[1])
#    list(statistic=ts[1],p.value=p.value)
#  }
#  
#  p.values1 <- matrix(NA,m,3)
#  p.values2 <- matrix(NA,m,3)
#  p.values3 <- matrix(NA,m,3)
#  p.values4 <- matrix(NA,m,3)
#  for(i in 1:m){
#    #x is sampled from a bivariate standard normal distribution and y from a
#    #bivariate normal distribution with mean 0 and standard deviation 2
#    x <- matrix(rnorm(n1*p),ncol=p);
#    y <- cbind(rnorm(n2,sd=2),rnorm(n2,sd=2));
#    z <- rbind(x,y)
#    p.values1[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values1[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values1[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
#  }
#  alpha <- 0.1;
#  pow1 <- colMeans(p.values1<alpha)
#  names(pow1) <- c('NN', 'energy', 'ball')
#  #empirical powers of the three methods
#  pow1
#  
#  #under the situation of unequal variances and unequal expectations
#  for(i in 1:m){
#    #x is sampled from a bivariate standard normal distribution and y from a
#    #bivariate normal distribution with mean 0.2 and standard deviation 2
#    x <- matrix(rnorm(n1*p),ncol=p);
#    y <- cbind(rnorm(n2,mean = mu,sd=2),rnorm(n2,mean=mu,sd=2));
#    z <- rbind(x,y)
#    p.values2[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values2[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values2[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
#  }
#  alpha <- 0.1;
#  pow2 <- colMeans(p.values2<alpha)
#  names(pow2) <- c('NN', 'energy', 'ball')
#  #empirical powers of the three methods
#  pow2
#  
#  #under the situation of non-normal distributions
#  for(i in 1:m){
#    #x is sampled from a bivariate t distribution with df = 1; the two
#    #coordinates of y come from two different normal distributions
#    x <- matrix(rt(n1*p,df=1),ncol = p);
#    y <- cbind(rnorm(n2),rnorm(n2,mean=mu,sd=4));
#    z <- rbind(x,y)
#    p.values3[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values3[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values3[i,3] <- bd.test(x=x,y=y,R=999,seed=i*12345)$p.value
#  }
#  alpha <- 0.1;
#  pow3 <- colMeans(p.values3<alpha)
#  names(pow3) <- c('NN', 'energy', 'ball')
#  #empirical powers of the three methods
#  pow3
#  
#  #unbalanced samples: 200 observations of x and 20 of y
#  n1 <- 200
#  n2 <- 20
#  n <- n1+n2
#  N = c(n1,n2)
#  for(i in 1:m){
#    x <- matrix(rnorm(n1*p),ncol=p);
#    y <- cbind(rnorm(n2),rnorm(n2));
#    z <- rbind(x,y)
#    p.values4[i,1] <- eqdist.nn(z,N,k)$p.value
#    p.values4[i,2] <- eqdist.etest(z,sizes=N,R=R)$p.value
#    p.values4[i,3] <- bd.test(x=x,y=y,R=99,seed=i*12345)$p.value
#  }
#  alpha <- 0.18;
#  pow4 <- colMeans(p.values4<alpha)
#  names(pow4) <- c('NN', 'energy', 'ball')
#  #empirical powers of the three methods
#  pow4
#  
#  
#  #The answer to the third question:
#  set.seed(1)
#  #density of the Cauchy(x0, gamma) distribution (standard Cauchy by default)
#  f <- function(x, x0 = 0, gamma = 1){
#    1/(pi * gamma * (1 + ((x - x0)/gamma)^2))
#  }
#  #length of the chain
#  m <- 80000
#  x <- numeric(m)
#  #initial value; the proposal distribution is normal with mean xt and sd 1
#  x[1] <- rnorm(1, mean = 0, sd = 1)
#  k <- 0
#  u <- runif(m)
#  for (i in 2:m) {
#    xt <- x[i-1]
#    y <- rnorm(1, mean = xt,sd=1)
#    num <- f(y) * dnorm(xt, mean = y,sd=1)
#    den <- f(xt) * dnorm(y, mean = xt,sd=1)
#    if (u[i] <= num/den) x[i] <- y else {
#      x[i] <- xt
#      k <- k+1 #y is rejected
#    }
#  }
#  #discard the burn-in sample
#  b <- 1001
#  y <- x[b:m]
#  a <- ppoints(200)
#  #quantiles of the Cauchy distribution
#  Qcauchy <- qcauchy(a)
#  #quantiles of the generated chain (after burn-in)
#  Q <- quantile(y, a)
#  qqplot(Qcauchy, Q,xlim=c(-2,2),ylim=c(-2,2),xlab="Cauchy Quantiles", ylab="Sample Quantiles",main = expression("Q-Q plot for Cauchy distribution"))
#  hist(y, breaks=50, main="", xlab="", freq=FALSE)
#  lines(Qcauchy, f(Qcauchy))
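#  
#  #a small diagnostic (a sketch, not part of the original answer): k counts
#  #the rejected proposals, so the observed acceptance rate of the chain is
#  print(1 - k/(m - 1))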
#  
#  #The answer to the fourth question:
#  size <- c(125,18,20,34)
#  #The following function prob computes the target density
#  prob <- function(y, size) {
#    if (y < 0 || y >1)
#      return (0)
#    return((1/2+y/4)^size[1] *
#             ((1-y)/4)^size[2] * ((1-y)/4)^size[3] *(y/4)^size[4])
#  }
#  #length of the chain
#  m <- 5000
#  #width of the uniform support set
#  w <- .25
#  #burn-in time
#  burn <- 1000
#  #for accept/reject step
#  u <- runif(m)
#  #proposal increments from Uniform(-w, w)
#  v <- runif(m, -w, w)
#  x <- numeric(m)
#  x[1] <- .25
#  for (i in 2:m) {
#    y <- x[i-1] + v[i]
#    if (u[i] <= prob(y, size) / prob(x[i-1], size))
#      x[i] <- y else
#        x[i] <- x[i-1]
#  }
#  xb <- x[(burn+1):m]
#  print(mean(xb))
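#  
#  #as a sanity check (a sketch, not part of the original answer), the MLE of
#  #this linkage model maximizes the log-likelihood directly and should be
#  #close to mean(xb):
#  loglik <- function(th) size[1]*log(2 + th) + (size[2] + size[3])*log(1 - th) + size[4]*log(th)
#  optimize(loglik, c(0, 1), maximum = TRUE)$maximum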

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  set.seed(1)
#  size <- c(125,18,20,34)
#  #The following function prob computes the target density
#  prob <- function(y, size) {
#    if (y < 0 || y >1)
#      return (0)
#    return((1/2+y/4)^size[1] *
#             ((1-y)/4)^size[2] * ((1-y)/4)^size[3] *(y/4)^size[4])
#  }
#  
#  Gelman.Rubin <- function(psi) {
#    # psi[i,j] is the statistic psi(X[i,1:j])
#    # for chain in i-th row of X
#    psi <- as.matrix(psi)
#    n <- ncol(psi)
#    k <- nrow(psi)
#    psi.means <- rowMeans(psi) #row means
#    B <- n * var(psi.means) #between variance est.
#    psi.w <- apply(psi, 1, "var") #within variances
#    W <- mean(psi.w) #within est.
#    v.hat <- W*(n-1)/n + (B/n) #upper variance est.
#    r.hat <- v.hat / W #G-R statistic
#    return(r.hat)
#  }
#  
#  multinomial.chain <- function(N, X1) {
#    #generates a Metropolis chain of length N for the multinomial target
#    #with Uniform(-w, w) random-walk proposals and starting value X1
#    #width of the uniform proposal
#    w <- 0.15
#    #burn-in time
#    burn <- 1000
#    #for accept/reject step
#    u <- runif(N)
#    #proposal distribution
#    v <- runif(N, -w, w)
#    x <- rep(0, N)
#    x[1] <- X1
#    for (i in 2:N) {
#      y <- x[i-1] + v[i]
#      r1 <- prob(y, size)
#      r2 <- prob(x[i-1], size)
#      r <- r1/r2
#      if (u[i] <= r)
#        x[i] <- y else
#          x[i] <- x[i-1]
#    }
#    return(x)
#  }
#  
#  k <- 4 #number of chains to generate
#  n <- 10000 #length of chains
#  b <- 2000 #burn-in length
#  #choose overdispersed initial values
#  x0 <- c(0.1,0.4,0.6,0.9)
#  #generate the chains
#  X <- matrix(0, nrow=k, ncol=n)
#  for (i in 1:k)
#    X[i, ] <- multinomial.chain(n, x0[i])
#  #compute diagnostic statistics
#  psi <- t(apply(X, 1, cumsum))
#  for (i in 1:nrow(psi))
#    psi[i,] <- psi[i,] / (1:ncol(psi))
#  print(Gelman.Rubin(psi))
#  #plot the sequences of running means psi for the four Metropolis-Hastings chains
#  par(mfrow=c(2,2))
#  for (i in 1:k)
#    plot(psi[i, (b+1):n], type="l",
#         xlab=i, ylab=bquote(psi))
#  par(mfrow=c(1,1)) #restore default
#  rhat <- rep(0, n)
#  for (j in (b+1):n)
#  rhat[j] <- Gelman.Rubin(psi[,1:j])
#  plot(rhat[(b+1):n], type="l", xlab="", ylab="R")
#  abline(h=1.05, lty=2)
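#  
#  #optionally (a sketch, assuming the coda package is installed), the same
#  #diagnostic can be cross-checked against coda's implementation:
#  library(coda)
#  chains <- do.call(mcmc.list, lapply(1:k, function(i) mcmc(X[i, ])))
#  gelman.diag(chains)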
#  
#  
#  #The answer to the second question:
#  m <- c(4:25,100,500,1000)
#  root <- numeric(length(m))
#  f1 <- function(a,k) pt(sqrt(a^2*(k-1)/(k-a^2)),k-1)-pt(sqrt(a^2*(k)/(k+1-a^2)),k)
#  for (i in seq_along(m)) {
#    root[i] <- uniroot(f1, c(1e-5, 2), k = m[i])$root
#  }
#  data <- data.frame(k=m,a=root,row.names=1)
#  data

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  options(digits=6)
#  #density of the Cauchy distribution with scale theta and location eta
#  f <- function(x, theta, eta) {
#    1/theta/pi/(1+((x-eta)/theta)^2)
#  }
#  #the vector x is used to evaluate the Cauchy cdf when theta = 1, eta = 0
#  x <- seq(-5,5,0.1)
#  values <- numeric(length(x))
#  #the vector x2 is used to evaluate the Cauchy cdf when theta = 1, eta = 10
#  x2 <- seq(0,10,0.1)
#  values2 <- numeric(length(x2))
#  for (i in 1:length(x))
#    values[i] <- integrate(f, lower=-Inf, upper=x[i],rel.tol=.Machine$double.eps^0.25,theta=1, eta=0)$value
#  #
#  data <- data.frame(value=values,truevalues=pcauchy(x),errors=values-pcauchy(x))
#  data[1:20,]
#  #compare the two evaluations of the Cauchy cdf: numerical integration vs pcauchy
#  plot(x, values, pch = 21, xlab = "x", ylab = "Cauchy cdf",
#       main = "Cauchy cdf (theta = 1, eta = 0): integrate vs pcauchy")
#  lines(x, pcauchy(x), col = 'red', lwd = 2)
#  
#  
#  #the situation when theta is 1 and eta is 10
#  for (i in 1:length(x2))
#    values2[i] <- integrate(f, lower=-Inf,upper=x2[i],rel.tol=.Machine$double.eps^0.25,theta=1, eta=10)$value
#  data2 <- data.frame(value=values2,truevalues=pcauchy(x2,location=10,scale=1),error=values2-pcauchy(x2,location=10,scale=1))
#  data2[1:20,]
#  #compare the two evaluations of the Cauchy cdf: numerical integration vs pcauchy
#  plot(x2, values2, pch = 21, xlab = "x", ylab = "Cauchy cdf",
#       main = "Cauchy cdf (theta = 1, eta = 10): integrate vs pcauchy")
#  lines(x2, pcauchy(x2, location = 10, scale = 1), col = 'red', lwd = 2)
#  
#  
#  #The answer to the second question:
#  library(nloptr)
#  #the EM objective: the negative expected complete-data log-likelihood
#  f <- function(x,x1,nA=28,nB=24,nOO=41,nAB=70) {
#    r1<-1-sum(x1)
#    nAA<-nA*x1[1]^2/(x1[1]^2+2*x1[1]*r1)
#    nBB<-nB*x1[2]^2/(x1[2]^2+2*x1[2]*r1)
#    r<-1-sum(x)
#    return(-2*nAA*log(x[1])-2*nBB*log(x[2])-2*nOO*log(r)-
#             (nA-nAA)*log(2*x[1]*r)-(nB-nBB)*log(2*x[2]*r)-nAB*log(2*x[1]*x[2]))
#  }
#  
#  # constraint function: p + q must stay below 1
#  g <- function(x,x1,nA=28,nB=24,nOO=41,nAB=70) {
#    return(sum(x)-0.999999)
#  }
#  
#  opts <- list("algorithm"="NLOPT_LN_COBYLA",
#               "xtol_rel"=1.0e-8)
#  mle<-NULL
#  r <- matrix(0, 1, 2)
#  r <- rbind(r, c(0.2, 0.35))  #initial values of p and q
#  j <- 2
#  while (sum(abs(r[j, ] - r[j-1, ])) > 1e-8) {
#    res <- nloptr(x0 = c(0.3, 0.25),
#                  eval_f = f,
#                  lb = c(0, 0), ub = c(1, 1),
#                  eval_g_ineq = g,
#                  opts = opts, x1 = r[j, ], nA = 28, nB = 24, nOO = 41, nAB = 70)
#    j <- j + 1
#    r <- rbind(r, res$solution)
#    mle <- c(mle, f(x = r[j, ], x1 = r[j-1, ]))
#  }
#  r    #the EM iterates of (p, q)
#  mle  #the corresponding objective values (negative log-likelihood)
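#  
#  #a direct cross-check (a sketch, not part of the original answer): maximize
#  #the observed-data log-likelihood with optim and compare with the EM result;
#  #obsloglik is a helper introduced only for this sketch
#  obsloglik <- function(x) {
#    p <- x[1]; q <- x[2]; r0 <- 1 - p - q
#    if (p <= 0 || q <= 0 || r0 <= 0) return(Inf)
#    -(28*log(p^2 + 2*p*r0) + 24*log(q^2 + 2*q*r0) + 41*log(r0^2) + 70*log(2*p*q))
#  }
#  optim(c(0.3, 0.2), obsloglik)$par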
#  

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  formulas <- list(
#    mpg ~ disp,
#    mpg ~ I(1 / disp),
#    mpg ~ disp + wt,
#    mpg ~ I(1 / disp) + wt
#  )
#  lapply(formulas,lm,data=mtcars)
#  out <- vector("list", length(formulas))
#  for (i in seq_along(formulas)) {
#  out[[i]] <- lm(formulas[[i]], data = mtcars)
#  }
#  out
#  
#  #The answer to the second question:
#  set.seed(1)
#  bootstraps <- lapply(1:10, function(i) {
#    rows <- sample(1:nrow(mtcars), replace = TRUE)
#    mtcars[rows, ]
#  })
#  getvalue <- function(x){
#    vars <- c('mpg','disp')
#    x[vars]
#  }
#  data <- lapply(bootstraps, getvalue)
#  #lm() accepts a data frame, regressing its first column on the others
#  lapply(data, lm)
#  set.seed(1)
#  out1 <- vector("list", length(bootstraps))
#  for (i in seq_along(bootstraps)) {
#  out1[[i]] <- getvalue(bootstraps[[i]])
#  }
#  
#  out2 <- vector("list", length(out1))
#  for (i in seq_along(out1)) {
#  out2[[i]] <- lm(out1[[i]])
#  }
#  out2
#  
#  
#  #The answer to the third question:
#  formulas <- list(
#    mpg ~ disp,
#    mpg ~ I(1 / disp),
#    mpg ~ disp + wt,
#    mpg ~ I(1 / disp) + wt
#  )
#  funclist <- lapply(formulas,lm,data=mtcars)
#  rsq <- function(mod) summary(mod)$r.squared
#  lapply(funclist, rsq)
#  getvalue <- function(x){
#    vars <- c('mpg','disp')
#    x[vars]
#  }
#  data <- lapply(bootstraps, getvalue)
#  data1 <- lapply(data,lm)
#  rsq <- function(mod) summary(mod)$r.squared
#  unlist(lapply(data1, rsq))
#  
#  
#  #The answer to the fourth question:
#  trials <- replicate(
#    100,
#    t.test(rpois(10, 10), rpois(7, 10)),
#    simplify = FALSE
#  )
#  #use an anonymous function
#  sapply(trials, function(mod) mod$p.value)
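#  #the same extraction can also be written without an anonymous function:
#  sapply(trials, "[[", "p.value")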
#  
#  
#  #The answer to the fifth question:
#  a <- c(1,2,3,4,5)
#  b <- c(6,7,8,9,10)
#  c <- c(2,4,6,8,10)
#  d <- data.frame(a,b,c)
#  #lapply1 sums the input vectors element-wise (in parallel over positions)
#  lapply1 <- function(x, ...){
#    args <- list(...)
#    for (a in args) x <- x + a
#    x
#  }
#  lapply1(a, b, c)
#  lapply1(d[1], d[2], d[3])
#  #lapply2 multiplies the input vectors element-wise (in parallel over positions)
#  lapply2 <- function(x,...){
#    args <- list(...)
#    for (a in args) x <- x*a
#    x
#  }
#  lapply2(a,b,c)
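#  
#  #the same element-wise reductions can be expressed with base Reduce
#  #(a sketch, not part of the original answer):
#  Reduce(`+`, list(a, b, c))
#  Reduce(`*`, list(a, b, c))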

## ----eval=FALSE----------------------------------------------------------
#  #The answer to the first question:
#  chisq.test1<- function (x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)),
#            rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
#  {
#    DNAME <- deparse(substitute(x))
#    #get the total sum of x
#    n <- sum(x)
#    if (is.matrix(x)) {
#      METHOD <- "changed Pearson's Chi-squared test"
#      nr <- as.integer(nrow(x))
#      nc <- as.integer(ncol(x))
#      if (is.na(nr) || is.na(nc) || is.na(nr * nc))
#        stop("invalid nrow(x) or ncol(x)", domain = NA)
#      #get the rowsums of x
#      sr <- rowSums(x)
#      #get the colsums of x
#      sc <- colSums(x)
#      #expected counts: outer product of the row and column sums, divided by n
#      E <- outer(sr, sc, "*")/n
#      v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3
#      V <- outer(sr, sc, v, n)
#      dimnames(E) <- dimnames(x)
#      #simulated p-value branch: requires strictly positive row and column
#      #sums; note it relies on stats-internal objects (setMETH, C_chisq_sim,
#      #almost.1) and only runs inside the stats namespace
#      if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
#        setMETH()
#        tmp <- .Call(C_chisq_sim, sr, sc, B, E)
#        STATISTIC <- sum(sort((x - E)^2/E, decreasing = TRUE))
#        PARAMETER <- NA
#        PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC))/(B +
#                                                          1)
#      }
#      else {
#        if (simulate.p.value)
#          warning("cannot compute simulated p-value with zero marginals")
#        if (correct && nrow(x) == 2L && ncol(x) == 2L) {
#          YATES <- min(0.5, abs(x - E))
#          if (YATES > 0)
#            METHOD <- paste(METHOD, "with Yates' continuity correction")
#        }
#        else YATES <- 0
#        STATISTIC <- sum((abs(x - E) - YATES)^2/E)
#        PARAMETER <- (nr - 1L) * (nc - 1L)
#        PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
#      }
#    }
#    names(STATISTIC) <- "X-squared"
#    names(PARAMETER) <- "df"
#    if (any(E < 5) && is.finite(PARAMETER))
#      warning("Chi-squared approximation may be incorrect")
#    #output the result
#    structure(list(statistic = STATISTIC, parameter = PARAMETER,
#                   p.value = PVAL, method = METHOD, data.name = DNAME, observed = x,
#                   expected = E, residuals = (x - E)/sqrt(E), stdres = (x -
#                                                                          E)/sqrt(V)), class = "htest")
#  }
#  library(vcd)
#  Treatment <- Arthritis$Treatment
#  Improved <- Arthritis$Improved
#  mytable <- table(Treatment,Improved)
#  #the modified chisq.test1 produces the same output as the original function
#  chisq.test(mytable)
#  chisq.test1(mytable)
#  
#  #compare the computation speed
#  set.seed(1)
#  x <- sample(1e5, 1e6, replace = TRUE)
#  y <- sample(1e5, 1e6, replace = TRUE)
#  system.time(chisq.test(rbind(x,y)))
#  system.time(chisq.test1(rbind(x,y)))
#  
#  
#  #The answer to the second question:
#  table2 <- function (... , dnn = list.names(...), deparse.level = 1)
#  {
#    list.names <- function(...) {
#      l <- as.list(substitute(list(...)))[-1L]
#      nm <- names(l)
#      fixup <- if (is.null(nm))
#        seq_along(l)
#      else nm == ""
#      dep <- vapply(l[fixup], function(x) switch(deparse.level +
#                                                   1, "", if (is.symbol(x)) as.character(x) else "",
#                                                 deparse(x, nlines = 1)[1L]), "")
#      if (is.null(nm))
#        dep
#      else {
#        nm[fixup] <- dep
#        nm
#      }
#    }
#  
#    args <- list(...)
#    if (length(args) == 1L && is.list(args[[1L]])) {
#      args <- args[[1L]]
#      if (length(dnn) != length(args))
#        dnn <- if (!is.null(argn <- names(args)))
#          argn
#      else paste(dnn[1L], seq_along(args), sep = ".")
#    }
#    bin <- 0L
#    lens <- NULL
#    dims <- integer()
#    pd <- 1L
#    dn <- NULL
#    for (a in args) {
#      if (is.null(lens))
#        lens <- length(a)
#      else if (length(a) != lens)
#        stop("all arguments must have the same length")
#      fact.a <- is.factor(a)
#      if (!fact.a) {
#        a0 <- a
#        a <- factor(a)
#      }
#      ll <- levels(a)
#      a <- as.integer(a)
#      nl <- length(ll)
#      dims <- c(dims, nl)
#      if (prod(dims) > .Machine$integer.max)
#        stop("attempt to make a table with >= 2^31 elements")
#      dn <- c(dn, list(ll))
#      bin <- bin + pd * (a - 1L)
#      pd <- pd * nl
#    }
#    names(dn) <- dnn
#    bin <- bin[!is.na(bin)]
#    if (length(bin))
#      bin <- bin + 1L
#    y <- array(tabulate(bin, pd), dims, dimnames = dn)
#    class(y) <- "table"
#    y
#  }
#  set.seed(1)
#  z <- sample(1:50,1e7,replace = TRUE)
#  system.time(table(z))
#  system.time(table2(z))