vignette.R
In ipsRdbs: Introduction to Probability, Statistics and R for Data-Based Sciences

## ----style, echo = FALSE, results = 'asis'------------------------------------
  BiocStyle::markdown()

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  class.source="watch-out",
  comment = "#>")

## ----setup,  eval=TRUE, echo=FALSE, include=FALSE-----------------------------
library(ggplot2)
require(ipsRdbs)
knitr::opts_chunk$set(eval = FALSE)

figpath <- system.file("figures", package = "ipsRdbs") 
print(figpath)

## ----coverplot, eval=TRUE, echo=FALSE, fig.cap="ipsRdbs book cover", fig.width =1.5, fig.height=2.5----
library(ipsRdbs)
figpath <- system.file("figures", package = "ipsRdbs") 
knitr::include_graphics(paste0(figpath, "/cover.jpg"))

## ----echo=TRUE, eval=FALSE----------------------------------------------------
#  install.packages("ipsRdbs", dependencies=TRUE)

## -----------------------------------------------------------------------------
#  library(ipsRdbs)
#  ls("package:ipsRdbs")

## ----eval=TRUE----------------------------------------------------------------
head(beanie)
summary(beanie)
plot(beanie$age, beanie$value, xlab="Age", ylab="Value", pch="*", col="red")

## ----bill, eval=TRUE----------------------------------------------------------
head(bill)
summary(bill)
library(ggplot2)
gg <- ggplot2::ggplot(data=bill, aes(x=age, y=wealth)) +
geom_point(aes(col=region, size=wealth)) +
geom_smooth(method="loess", se=FALSE) +
xlim(c(7, 102)) +
ylim(c(1, 37)) +
labs(subtitle="Wealth vs Age of Billionaires",
y="Wealth (Billion US $)", x="Age",
title="Scatterplot", caption = "Source: Fortune Magazine, 1992.")
plot(gg)

## ----bodyfat, eval=TRUE-------------------------------------------------------
summary(bodyfat)
plot(bodyfat$Skinfold,  bodyfat$Bodyfat, xlab="Skin", ylab="Fat")
plot(bodyfat$Skinfold,  log(bodyfat$Bodyfat), xlab="Skin", ylab="log Fat")
plot(log(bodyfat$Skinfold),  log(bodyfat$Bodyfat), xlab="log Skin", ylab="log Fat")
# Keep the transformed variables in the data set 
bodyfat$logskin <- log(bodyfat$Skinfold)
bodyfat$logbfat <- log(bodyfat$Bodyfat)
bodyfat$logskin <- log(bodyfat$Skinfold)
 # Create a grouped variable 
bodyfat$cutskin <- cut(log(bodyfat$Skinfold), breaks=6) 
boxplot(data=bodyfat, Bodyfat~cutskin, col=2:7)
require(ggplot2)
p2 <- ggplot(data=bodyfat, aes(x=cutskin, y=logbfat)) + 
geom_boxplot(col=2:7) + 
stat_summary(fun=mean, geom="line", aes(group=1), col="blue", linewidth=1) +
labs(x="Skinfold", y="Percentage of log bodyfat", 
title="Boxplot of log-bodyfat percentage vs grouped log-skinfold")  
plot(p2)

n <- nrow(bodyfat)
x <- bodyfat$logskin
y <- bodyfat$logbfat
xbar <- mean(x)
ybar <- mean(y)
sx2 <- var(x)
sy2 <- var(y)
sxy <- cov(x, y)
r <- cor(x, y)
print(list(n=n, xbar=xbar, ybar=ybar, sx2=sx2, sy2=sy2, sxy=sxy, r=r))
hatbeta1 <- r * sqrt(sy2/sx2) # calculates estimate of the slope
hatbeta0 <- ybar - hatbeta1 * xbar # calculates estimate of the intercept
rs <-  y - hatbeta0 - hatbeta1 * x  # calculates residuals
s2 <- sum(rs^2)/(n-2)  # calculates estimate of sigma2
s2
bfat.lm <- lm(logbfat ~ logskin, data=bodyfat)
### Check the diagnostics 
plot(bfat.lm$fit, bfat.lm$res, xlab="Fitted values", ylab = "Residuals", pch="*")
abline(h=0)
### Should be a random scatter
qqnorm(bfat.lm$res, col=2)
qqline(bfat.lm$res, col="blue")

# All Points should be on the straight line 
summary(bfat.lm)
anova(bfat.lm)
plot(bodyfat$logskin,  bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=7)
title("Scatter plot with the fitted Linear Regression line")
# 95% CI for beta(1)
# 0.88225 + c(-1, 1) * qt(0.975, df=100) *  0.02479 
# round(0.88225 + c(-1, 1) * qt(0.975, df=100) *  0.02479, 2)
# To test H0: beta1 = 1. 
tstat <- (0.88225 -1)/0.02479 
pval <- 2 * (1- pt(abs(tstat), df=100))
x <- seq(from=-5, to=5, length=500)
y <- dt(x, df=100)
plot(x, y,  xlab="", ylab="", type="l")
title("T-density with df=100")
abline(v=abs(tstat))
abline(h=0)
x1 <- seq(from=abs(tstat), to=10, length=100)
y1 <- rep(0, length=100)
x2 <- x1
y2 <- dt(x1, df=100)
segments(x1, y1, x2, y2)
abline(h=0)
# Predict at a new value of Skinfold=70
# Create a new data set called new
newx <- data.frame(logskin=log(70))
a <- predict(bfat.lm, newdata=newx, se.fit=TRUE) 
# Confidence interval for the mean of log bodyfat  at skinfold=70
a <- predict(bfat.lm, newdata=newx, interval="confidence") 
# a
#          fit      lwr     upr
# [1,] 2.498339 2.474198 2.52248
# Prediction interval for a future log bodyfat  at skinfold=70
a <- predict(bfat.lm, newdata=newx, interval="prediction") 
a
#          fit      lwr      upr
# [1,] 2.498339 2.333783 2.662895
#prediction intervals for the mean 
pred.bfat.clim <- predict(bfat.lm, data=bodyfat, interval="confidence")
#prediction intervals for future observation
pred.bfat.plim <- suppressWarnings(predict(bfat.lm, data=bodyfat, interval="prediction"))
plot(bodyfat$logskin,  bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=5)
lines(log(bodyfat$Skinfold), pred.bfat.clim[,2], lty=2, col=2) 
lines(log(bodyfat$Skinfold), pred.bfat.clim[,3], lty=2, col=2) 
lines(log(bodyfat$Skinfold), pred.bfat.plim[,2], lty=4, col=3) 
lines(log(bodyfat$Skinfold), pred.bfat.plim[,3], lty=4, col=3) 
title("Scatter plot with the fitted line and prediction intervals")
symb <- c("Fitted line", "95% CI for mean", "95% CI for observation")
## legend(locator(1), legend = symb, lty = c(1, 2, 4), col = c(5, 2, 3))
# Shows where we predicted earlier 
abline(v=log(70))
summary(bfat.lm)
anova(bfat.lm)

## ----bombhits, eval=TRUE------------------------------------------------------
 summary(bombhits)
 # Create a vector of data 
 x <- c(rep(0, 229), rep(1, 211), rep(2, 93), rep(3, 35), rep(4, 7), 5)
 y <- c(229, 211, 93, 35, 7, 1) # Frequencies 
 rel_freq <- y/576
 xbar <- mean(x)
 pois_prob <- dpois(x=0:5, lambda=xbar)
 fit_freq <- pois_prob * 576
  #Check 
  cbind(x=0:5, obs_freq=y, rel_freq =round(rel_freq, 4),  
  Poisson_prob=round(pois_prob, 4), fit_freq=round(fit_freq, 1))
  obs_freq <- y
  xuniques <- 0:5
  a <- data.frame(xuniques=0:5, obs_freq =y, fit_freq=fit_freq)
  barplot(rbind(obs_freq, fit_freq), 
  args.legend = list(x = "topright"), 
  xlab="No of bomb hits",  
  names.arg = xuniques,  beside=TRUE, 
  col=c("darkblue","red"), 
  legend =c("Observed", "Fitted"), 
  main="Observed and Poisson distribution fitted frequencies 
  for the number of bomb hits in London")

## ----eval=TRUE----------------------------------------------------------------
summary(cement)
boxplot(data=cement, strength~gauger, col=1:3)
boxplot(data=cement, strength~breaker, col=1:3)

## ----eval=TRUE, message=FALSE, warning=FALSE----------------------------------
summary(cheese)
pairs(cheese)
GGally::ggpairs(data=cheese)
cheese.lm <- lm(Taste ~ AceticAcid +  LacticAcid + logH2S, data=cheese, subset=2:30)
 # Check the diagnostics 
 plot(cheese.lm$fit, cheese.lm$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)
 # Should be a random scatter
 qqnorm(cheese.lm$res, col=2)
 qqline(cheese.lm$res, col="blue")
 summary(cheese.lm)
 cheese.lm2 <- lm(Taste ~ LacticAcid + logH2S, data=cheese)
 # Check the diagnostics 
 plot(cheese.lm2$fit, cheese.lm2$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)
 qqnorm(cheese.lm2$res, col=2)
 qqline(cheese.lm2$res, col="blue")
 summary(cheese.lm2)
 # How can we predict? 
 newcheese <- data.frame(AceticAcid = 300, LacticAcid = 1.5, logH2S=4)
 cheese.pred <- predict(cheese.lm2, newdata=newcheese, se.fit=TRUE)
 cheese.pred
 # Obtain confidence interval 
 cheese.pred$fit + c(-1, 1) * qt(0.975, df=27) * cheese.pred$se.fit
 # Using R to predict  
 cheese.pred.conf.limits <- predict(cheese.lm2, newdata=newcheese, interval="confidence")
 cheese.pred.conf.limits
 # How to find prediction interval 
 cheese.pred.pred.limits <- predict(cheese.lm2, newdata=newcheese, interval="prediction")
 cheese.pred.pred.limits

## ----emissions, eval=TRUE-----------------------------------------------------
summary(emissions)
 
 rawdata <- emissions[, c(8, 4:7)]
 pairs(rawdata)
# Fit the model on the raw scale 
raw.lm <- lm(ADR37 ~ ADR27 + CS505  + T867 + H505, data=rawdata) 
old.par <- par(no.readonly = TRUE)
par(mfrow=c(2,1))
plot(raw.lm$fit, raw.lm$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(raw.lm$res,main="Normal probability plot", col=2)
qqline(raw.lm$res, col="blue")
# summary(raw.lm)
logdata <- log(rawdata)
# This only logs the values but not the column names!
# We can use the following command to change the column names or you can use
# fix(logdata) to do it. 
dimnames(logdata)[[2]] <- c("logADR37", "logCS505", "logT867", "logH505", "logADR27")
pairs(logdata)
log.lm <- lm(logADR37 ~ logADR27 + logCS505  + logT867 + logH505, data=logdata) 
plot(log.lm$fit, log.lm$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(log.lm$res,main="Normal probability plot", col=2)
qqline(log.lm$res, col="blue")
summary(log.lm)
log.lm2 <- lm(logADR37 ~ logADR27 + logT867 + logH505, data=logdata) 
summary(log.lm2)
plot(log.lm2$fit, log.lm2$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(log.lm2$res,main="Normal probability plot", col=2)
qqline(log.lm2$res, col="blue")
par(old.par)
#####################################
# Multicollinearity Analysis 
######################################
mod.adr27 <-  lm(logADR27 ~ logT867 + logCS505 + logH505, data=logdata) 
summary(mod.adr27) # Multiple R^2 = 0.9936,
mod.t867 <-  lm(logT867 ~ logADR27 + logH505 + logCS505, data=logdata)  
summary(mod.t867) # Multiple R^2 = 0.977,
mod.cs505 <-  lm(logCS505 ~ logADR27 + logH505 + logT867, data=logdata)  
summary(mod.cs505) # Multiple R^2 = 0.9837,
mod.h505 <-  lm(logH505 ~ logADR27 + logCS505 + logT867, data=logdata)  
summary(mod.h505) # Multiple R^2 = 0.5784,
# Variance inflation factors 
vifs <- c(0.9936, 0.977, 0.9837, 0.5784)
vifs <- 1/(1-vifs) 
#Condition numbers 
X <- logdata 
# X is a copy of logdata 
X[,1] <- 1
# the first column of X is 1
# this is for the intercept 
X <- as.matrix(X) 
# Coerces X to be a matrix
xtx <- t(X) %*% X # Gives X^T X
eigenvalues <- eigen(xtx)$values
kappa <- max(eigenvalues)/min(eigenvalues)
kappa <- sqrt(kappa)
# kappa = 244 is much LARGER than 30!

### Validation statistic
# Fit the log.lm2 model with the first 45 observations  
# use the fitted model to predict the remaining 9 observations 
# Calculate the mean square error validation statistic 
log.lmsub <- lm(logADR37 ~ logADR27 + logT867 + logH505, data=logdata, subset=1:45) 
# Now predict all 54 observations using the fitted model
mod.pred <- predict(log.lmsub, logdata, se.fit=TRUE) 
mod.pred$fit # provides all the 54 predicted values 
logdata$pred <- mod.pred$fit
# Get only last 9 
a <- logdata[46:54, ]
validation.residuals <- a$logADR37 - a$pred  
validation.stat <- mean(validation.residuals^2)
validation.stat

## ----eval=TRUE----------------------------------------------------------------
summary(err_age)

## ----ffood, eval=TRUE---------------------------------------------------------
summary(ffood)
 # 95% Confidence interval for the mean waiting time usig t-distribution
 a <- c(ffood$AM, ffood$PM)
 mean(a) + c(-1, 1) * qt(0.975, df=19) * sqrt(var(a))/sqrt(20) 
 # Two sample t-test for the difference between morning and afternoon times
 t.test(ffood$AM, ffood$PM)

## ----gasmileage , eval=TRUE---------------------------------------------------
summary(gasmileage)
y <- c(22, 26,  28, 24, 29,   29, 32, 28,  23, 24)
xx <- c(1,1,2,2,2,3,3,3,4,4)
# Plot the observations 
plot(xx, y, col="red", pch="*", xlab="Model", ylab="Mileage")
# Method1: Hand calculation 
ni <- c(2, 3, 3, 2)
means <- tapply(y, xx, mean)
vars <- tapply(y, xx, var)
round(rbind(means, vars), 2)
sum(y^2) # gives 7115
totalSS <- sum(y^2) - 10 * (mean(y))^2 # gives 92.5 
RSSf <- sum(vars*(ni-1)) # gives 31.17 
groupSS <- totalSS - RSSf # gives 61.3331.17/6
meangroupSS <- groupSS/3 # gives 20.44
meanErrorSS <- RSSf/6 # gives 5.194
Fvalue <- meangroupSS/meanErrorSS # gives 3.936 
pvalue <- 1-pf(Fvalue, df1=3, df2=6)

#### Method 2: Illustrate using dummy variables
#################################
#Create the design matrix X for the full regression model
g <- 4
n1 <- 2 
n2 <- 3
n3 <- 3
n4 <- 2
n <- n1+n2+n3+n4
X <- matrix(0, ncol=g, nrow=n)       #Set X as a zero matrix initially
X[1:n1,1] <- 1    #Determine the first column of X
X[(n1+1):(n1+n2),2] <- 1   #the 2nd column
X[(n1+n2+1):(n1+n2+n3),3] <- 1    #the 3rd
X[(n1+n2+n3+1):(n1+n2+n3+n4),4] <- 1    #the 4th 
#################################
####Fitting the  full model####
#################################
#Estimation
XtXinv <- solve(t(X)%*%X)
betahat <- XtXinv %*%t(X)%*%y   #Estimation of the coefficients
Yhat <- X%*%betahat   #Fitted Y values
Resids <- y - Yhat   #Residuals
SSE <- sum(Resids^2)   #Error sum of squares
S2hat <- SSE/(n-g)   #Estimation of sigma-square; mean square for error
Sigmahat <- sqrt(S2hat)

##############################################################
####Fitting the reduced model -- the 4 means are equal #####
##############################################################
Xr <- matrix(1, ncol=1, nrow=n)
kr <- dim(Xr)[2]
#Estimation
Varr <- solve(t(Xr)%*%Xr)
hbetar <- solve(t(Xr)%*%Xr)%*%t(Xr)%*% y   #Estimation of the coefficients
hYr = Xr%*%hbetar   #Fitted Y values
Resir <- y - hYr   #Residuals
SSEr <- sum(Resir^2)   #Total sum of squares
###################################################################
####F-test for comparing the reduced model with the full model ####
###################################################################
FStat <- ((SSEr-SSE)/(g-kr))/(SSE/(n-g))  #The test statistic of the F-test
alpha <- 0.05
Critical_value_F <- qf(1-alpha, g-kr,n-g)  #The critical constant of F-test
pvalue_F <- 1-pf(FStat,g-kr, n-g)   #p-value of F-test

modelA <- c(22, 26)
modelB <- c(28, 24, 29)
modelC <- c(29, 32, 28)
modelD <- c(23, 24)

SSerror = sum( (modelA-mean(modelA))^2 ) + sum( (modelB-mean(modelB))^2 ) 
+ sum( (modelC-mean(modelC))^2 ) + sum( (modelD-mean(modelD))^2 )
SStotal <-  sum( (y-mean(y))^2 ) 
SSgroup <- SStotal-SSerror

####
#### Method 3: Use the  built-in function lm directly

#####################################
aa <- "modelA"
bb <- "modelB"
cc <- "modelC"
dd <- "modelD"
Expl <- c(aa,aa,bb,bb,bb,cc,cc,cc,dd,dd)
is.factor(Expl)
Expl <- factor(Expl)
model1 <- lm(y~Expl)
summary(model1)      
anova(model1)
###Alternatively ###

xxf <- factor(xx)
is.factor(xxf)
model2 <- lm(y~xxf)
summary(model2)
anova(model2)

## ----possum, eval=TRUE--------------------------------------------------------
 head(possum)
 dim(possum)
 summary(possum)
 ## Code lines used in the book
 ## Create a new data set   
 poss <- possum 
 poss$region <- factor(poss$Location)
 levels(poss$region) <- c("W.A", "S.A", "N.T", "QuL", "NSW", "Vic", "Tas")
 colnames(poss)<-c("y","z","Location", "x")
 head(poss)
 # Draw side by side boxplots 
 boxplot(y~x, data=poss, col=2:8, xlab="region", ylab="Weight")
 # Obtain scatter plot 
 # Start with a skeleton plot 
 plot(poss$z, poss$y, type="n", xlab="Length", ylab="Weight")
 # Add points for the seven regions
 for (i in 1:7) {
    points(poss$z[poss$Location==i],poss$y[poss$Location==i],type="p", pch=as.character(i), col=i)
    }
## Add legends 
 legend(x=76, y=4.2, legend=paste(as.character(1:7), levels(poss$x)),  lty=1:7, col=1:7)
 # Start  modelling 
 #Fit the model with interaction. 
 poss.lm1<-lm(y~z+x+z:x,data=poss)
 summary(poss.lm1)
 plot(poss$z, poss$y,type="n", xlab="Length", ylab="Weight")
 for (i in 1:7) {
 lines(poss$z[poss$Location==i],poss.lm1$fit[poss$Location==i],type="l",
 lty=i, col=i, lwd=1.8)
 points(poss$z[poss$Location==i],poss$y[poss$Location==i],type="p",
 pch=as.character(i), col=i)
 }
 poss.lm0 <- lm(y~z,data=poss)
 abline(poss.lm0, lwd=3, col=9)
 # Has drawn the seven parallel regression lines
 legend(x=76, y=4.2, legend=paste(as.character(1:7), levels(poss$x)), 
 lty=1:7, col=1:7)
 
 n <- length(possum$Body_Weight)
 # Wrong model since Location is not a numeric covariate 
 wrong.lm <- lm(Body_Weight~Location, data=possum)
 summary(wrong.lm)
 
 nis <- table(possum$Location)
 meanwts <- tapply(possum$Body_Weight, possum$Location, mean)
 varwts <- tapply(possum$Body_Weight, possum$Location, var)
 datasums <- data.frame(nis=nis, mean=meanwts, var=varwts)
 datasums <- data.frame(nis=nis, mean=meanwts, var=varwts)
 modelss <- sum(datasums[,2] * (meanwts - mean(meanwts))^2)
 residss <- sum( (datasums[,2] - 1) * varwts)
 
 fvalue <- (modelss/6) / (residss/94)
 fcritical <- qf(0.95, df1= 6, df2=94)
 x <- seq(from=0, to=12, length=200)
 y <- df(x, df1=6, df2=94)
 plot(x, y, type="l", xlab="", ylab="Density of F(6, 94)", col=4)
 abline(v=fcritical, lty=3, col=3)
 abline(v=fvalue, lty=2, col=2)
 pvalue <- 1-pf(fvalue, df1=6, df2=94)
 
 ### Doing the above in R
 # Convert  the Location column to a factor
 
 localpossum <- possum
 localpossum$Location <- as.factor(localpossum$Location)
 summary(localpossum)  # Now Location is a factor 
  
 # Put the identifiability constraint:
 options(contrasts=c("contr.treatment", "contr.poly"))
 # Change to have easier column names 
 colnames(localpossum) <- c("y", "z", "x")
 # Fit model M1
 possum.lm1 <- lm(y~x, data=localpossum)
 summary(possum.lm1)
 anova(possum.lm1)
 possum.lm2 <- lm(y~z, data=localpossum)
 summary(possum.lm2)
 anova(possum.lm2)
 # Include both location and length but no interaction 
 possum.lm3 <-  lm(y~x+z, data=localpossum)
 summary(possum.lm3)
 anova(possum.lm3)
 # Include interaction effect 
 possum.lm4 <-  lm(y~x+z+x:z, data=localpossum)
 summary(possum.lm4)
 anova(possum.lm4)
 anova(possum.lm2, possum.lm3)
 #Check the diagnostics for M3
 plot(possum.lm3$fit, possum.lm3$res,xlab="Fitted values",ylab="Residuals", 
 main="Anscombe plot")
 abline(h=0)
 qqnorm(possum.lm3$res,main="Normal probability plot", col=2)
 qqline(possum.lm3$res, col="blue")
 rm(localpossum)
 rm(poss)

## ----puffin, eval=TRUE--------------------------------------------------------
head(puffin)
dim(puffin)
summary(puffin)
pairs(puffin)
puffin$sqrtfreq <- sqrt(puffin$Nesting_Frequency)
puff.sqlm <- lm(sqrtfreq~ Grass_Cover + Mean_Soil_Depth + Slope_Angle 
+Distance_from_Edge, data=puffin) 
old.par <- par(no.readonly = TRUE)
par(mfrow=c(2,1))
qqnorm(puff.sqlm$res,main="Normal probability plot", col=2)
qqline(puff.sqlm$res, col="blue")
plot(puff.sqlm$fit, puff.sqlm$res,xlab="Fitted values",ylab="Residuals", 
main="Anscombe plot", col="red")
abline(h=0)
summary(puff.sqlm)
par(old.par)
#####################################
# F test for two betas at the  same time: 
######################################
puff.sqlm2 <- lm(sqrtfreq~ Mean_Soil_Depth + Distance_from_Edge, data=puffin) 
anova(puff.sqlm)
anova(puff.sqlm2)
fval <-  1/2*(14.245-12.756)/0.387 # 1.924 
qf(0.95, 2, 33) # 3.28
1-pf(fval, 2, 33) # 0.162
anova(puff.sqlm2, puff.sqlm)

## ----rice, eval=TRUE----------------------------------------------------------
 summary(rice)
 plot(rice$Days, rice$Yield, pch="*", xlab="Days", ylab="Yield")
 rice$daymin31 <- rice$Days-31
 rice.lm <- lm(Yield ~ daymin31, data=rice)
 summary(rice.lm)
 # Check the diagnostics 
 plot(rice.lm$fit, rice.lm$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)
 # Should be a random scatter
 # Needs a quadratic term
 
 qqnorm(rice.lm$res, col=2)
 qqline(rice.lm$res, col="blue")
 rice.lm2 <- lm(Yield ~ daymin31 + I(daymin31^2) , data=rice)
 old.par <- par(no.readonly = TRUE)
 par(mfrow=c(1, 2))
 plot(rice.lm2$fit, rice.lm2$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)
 # Should be a random scatter 
 # Much better plot!
 qqnorm(rice.lm2$res, col=2)
 qqline(rice.lm2$res, col="blue")
 summary(rice.lm2)
 par(old.par) # par(mfrow=c(1,1))
 plot(rice$Days,  rice$Yield, xlab="Days", ylab="Yield")
 lines(rice$Days, rice.lm2$fit, lty=1, col=3)
 rice.lm3 <- lm(Yield ~ daymin31 + I(daymin31^2)+I(daymin31^3) , data=rice)
 #check the diagnostics 
 summary(rice.lm3) # Will print the summary of the fitted model 
 #### Predict at a new value of Days=31.1465
 
 # Create a new data set called new
 new <- data.frame(daymin31=32.1465-31)
 
 a <- predict(rice.lm2, newdata=new, se.fit=TRUE) 
 # Confidence interval for the mean of rice yield  at day=31.1465
 a <- predict(rice.lm2, newdata=new, interval="confidence") 
 a
 #          fit      lwr      upr
 # [1,] 3676.766 3511.904 3841.628
 # Prediction interval for a future yield at day=31.1465
 b <- predict(rice.lm2, newdata=new, interval="prediction") 
 b
 # fit      lwr      upr
 #[1,] 3676.766 3206.461 4147.071

## ----wgain, eval=TRUE---------------------------------------------------------
summary(wgain)
# 95% Confidence interval for mean weight gain 
x <- wgain$final - wgain$initial
mean(x) + c(-1, 1) * qt(0.975, df=67) * sqrt(var(x)/68)
# t-test to test the mean difference equals 0
t.test(x)

## ----butterfly, eval=TRUE-----------------------------------------------------
butterfly(color = 6)
old.par <- par(no.readonly = TRUE)
par(mfrow=c(2, 2))
butterfly(a=10, b=1.5, color = "seagreen")
butterfly(color = 6)
butterfly(a=5, b=5, color=2)
butterfly(a=20, b=4, color = "blue")
par(old.par) # par(mfrow=c(1, 1))

## ----Montyplot, eval=TRUE, echo=FALSE, fig.cap="Monty Hall game", fig.width =2, fig.height=1----
library(ipsRdbs)
figpath <- system.file("figures", package = "ipsRdbs") 
knitr::include_graphics(paste0(figpath, "/monty.png"))

## -----------------------------------------------------------------------------
#  monty("stay", print_games = FALSE)
#  monty("switch", print_games = FALSE)
#  monty("random", print_games = FALSE)

## ----cltforBernoulli, eval=TRUE-----------------------------------------------
a <- see_the_clt_for_Bernoulli()
old.par <- par(no.readonly = TRUE)
par(mfrow=c(2, 3))
a30 <- see_the_clt_for_Bernoulli(nsize=30)
a50 <- see_the_clt_for_Bernoulli(nsize=50)
a100 <- see_the_clt_for_Bernoulli(nsize=100)
a500 <- see_the_clt_for_Bernoulli(nsize=500)
a1000 <- see_the_clt_for_Bernoulli(nsize=1000)
a5000 <- see_the_clt_for_Bernoulli(nsize=5000)
par(old.par)

## ----seethecltforuniform, eval=TRUE-------------------------------------------
a <- see_the_clt_for_uniform()
old.par <- par(no.readonly = TRUE) 
par(mfrow=c(2, 3))
a1 <- see_the_clt_for_uniform(nsize=1)
a2 <- see_the_clt_for_uniform(nsize=2)
a3 <- see_the_clt_for_uniform(nsize=5)
a4 <- see_the_clt_for_uniform(nsize=10)
a5 <- see_the_clt_for_uniform(nsize=20)
a6 <- see_the_clt_for_uniform(nsize=50)
par(old.par)
ybars <- see_the_clt_for_uniform(nsize=12)
zbars <- (ybars - mean(ybars))/sd(ybars)
k <- 100
u <- seq(from=min(zbars), to= max(zbars), length=k)
ecdf <-  rep(NA, k)
for(i in 1:k) ecdf[i] <- length(zbars[zbars<u[i]])/length(zbars)
tcdf <- pnorm(u)
plot(u, tcdf, type="l", col="red", lwd=4, xlab="", ylab="cdf")
lines(u, ecdf, lty=2, col="darkgreen", lwd=4)
symb <- c("cdf of sample means", "cdf of N(0, 1)")
legend(x=-3.5, y=0.4, legend = symb, lty = c(2, 1), 
col = c("darkgreen","red"), bty="n")

## ----seethewlln, eval=TRUE----------------------------------------------------
a1 <- see_the_wlln_for_uniform(nsize=1, nrep=50000)
a2 <- see_the_wlln_for_uniform(nsize=10, nrep=50000)
a3 <- see_the_wlln_for_uniform(nsize=50, nrep=50000)
a4 <- see_the_wlln_for_uniform(nsize=100, nrep=50000)
plot(a4, type="l", lwd=2, ylim=range(c(a1$y, a2$y, a3$y, a4$y)), col=1, 
lty=1, xlab="mean", ylab="density estimates")
lines(a3, type="l", lwd=2, col=2, lty=2)
lines(a2, type="l", lwd=2, col=3, lty=3)
lines(a1, type="l", lwd=2, col=4, lty=4)
symb <- c("n=1", "n=10", "n=50", "n=100")
legend(x=0.37, y=11.5, legend = symb, lty =4:1, col = 4:1)
Any scripts or data that you put into this service are public.
ipsRdbs documentation built on May 29, 2024, 4:15 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ipsRdbs
Introduction to Probability, Statistics and R for Data-Based Sciences

inst/doc/vignette.R
In ipsRdbs: Introduction to Probability, Statistics and R for Data-Based Sciences

Try the ipsRdbs package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ipsRdbs Introduction to Probability, Statistics and R for Data-Based Sciences

inst/doc/vignette.R In ipsRdbs: Introduction to Probability, Statistics and R for Data-Based Sciences

Try the ipsRdbs package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ipsRdbs
Introduction to Probability, Statistics and R for Data-Based Sciences

inst/doc/vignette.R
In ipsRdbs: Introduction to Probability, Statistics and R for Data-Based Sciences