knitr::opts_chunk$set(echo = FALSE)
library(DiagrammeR)
library(knitr)
library(kableExtra)
library(FD)
library(vegan)
library(datasets)
library(datasets)
library(FSA)
library(dplyr)
library(edcpR)
library(car)

Session 5: Comparing samples

Exercises for today

Defining the correct statistical test based on different case studies and implement this in R.

Question 1

unloadNamespace("plant_growth")
rm("plant_growth")
# First update package
library(edcpR)
data(plant_growth, package = "edcpR")

We study the effect of a new fertilizer (NPK) on the growth of maize. We measured the yield in 10 control plots (Ctrl) vs 10 plots that received the fertilizer treatment.

Research question: What is the effect of fertilizer on the biomass yield of maize?

Question 1 Solution (1)

data(plant_growth, package = "edcpR")
colnames(plant_growth) <- c("Yield", "Treatment")
head(plant_growth)

Question 1 Solution (2)

str(plant_growth)
plant_growth$Treatment <- as.factor(plant_growth$Treatment)
str(plant_growth)

Question 1 Solution (3)

# Difference in a quantitative variable between two groups
# Sample size  = 10
# Too small for a parametric test --> non-parametric test preferred!
# Independent samples? --> check!
# Independent two-group Mann-Whitney test
wilcox.test(Yield ~ Treatment, data = plant_growth)

Let's consider sample size being large enough...

Question 1 Solution (4)

m<-mean(plant_growth$Yield)
std<-sqrt(var(plant_growth$Yield))
# We can check if normality holds
hist(plant_growth$Yield ~ plant_growth$Treatment, prob=TRUE, xlab="Yield",main="Histogram of Yield")

Question 2

library(datasets)
data(iris, package = "datasets")

We studied the differences in flower dimensions between three iris species (Iris setosa, Iris versicolor, Iris virginica). We sampled 10 individuals of each species and measured the sepal and petal length and width.

Research question: Do the three iris species have different sepal lengths?

Question 2 Solution

unloadNamespace("iris")
rm("iris")
data(iris, package = "datasets")
head(iris)
iris <- iris[, c(1,5)]
colnames(iris) <- c("sepal_length", "Species")

Question 2 Solution

str(iris)
iris$Species <- as.factor(iris$Species)

Question 2 Solution

# Difference in a quantitative variable between 3 samples
# Total observations = 150
# 50 observations per  --> assumptions
# Independent samples? --> check!

Question 2 Solution

par(mfrow = c(1,3))
hist(iris %>% filter(Species == "setosa") %>% pull(sepal_length), prob=TRUE, xlab="Sepal lenghth",main="Histogram of sepal length")
hist(iris %>% filter(Species == "versicolor") %>% pull(sepal_length) , prob=TRUE, xlab="Sepal lenghth",main="Histogram of sepal length")
hist(iris %>% filter(Species == "virginica") %>% pull(sepal_length) , prob=TRUE, xlab="Sepal lenghth",main="Histogram of sepal length")

Question 2 Solution

par(mfrow = c(1,3))
qqnorm(iris %>% filter(Species == "setosa") %>% pull(sepal_length), xlab="Sepal lenghth",main="Histogram of sepal length")
qqline(iris %>% filter(Species == "setosa") %>% pull(sepal_length), col = "red", lwd = 2)
qqnorm(iris %>% filter(Species == "versicolor") %>% pull(sepal_length), xlab="Sepal lenghth",main="Histogram of sepal length")
qqline(iris %>% filter(Species == "versicolor") %>% pull(sepal_length), col = "red", lwd = 2)
qqnorm(iris %>% filter(Species == "virginica") %>% pull(sepal_length), xlab="Sepal lenghth",main="Histogram of sepal length")
qqline(iris %>% filter(Species == "virginica") %>% pull(sepal_length), col = "red", lwd = 2)

Question 2 Solution

# Statistical test for normality (Shapiro-Wilk)
# Not recommended, because it is too strict
shapiro.test(iris %>% filter(Species == "setosa") %>% pull(sepal_length))
shapiro.test(iris %>% filter(Species == "versicolor") %>% pull(sepal_length))
shapiro.test(iris %>% filter(Species == "virginica") %>% pull(sepal_length))

Question 2 Solution

# We can check if homoscedasticity holds
# Strong evidence for normal distribution --> Bartlets test has better performance
bartlett.test(sepal_length ~ Species, data = iris)

Question 2 Solution

# Non-parametric tests: Kruskall-Wallis
kruskal.test(sepal_length ~ Species, data = iris)

Question 2 Solution

# Post-hoc testing to check between samples
library(FSA)
# Benjamini-Hochberg adjustment, not Bonferonni (too strict)
dunnTest(sepal_length ~ Species, data = iris, method= "bh")

Question 2 Solution

# Let's consider homoscedasticity holds ...
# Compute the ANOVA
res.aov <- aov(sepal_length ~ Species, data = iris)
summary(res.aov)

Question 2 Solution

# Post-hoc testing to check between samples
# Tuckey post-hoc test
TukeyHSD(res.aov)

Question 3

data(meerdaal, package = "edcpR")

We have vegetation records in Meerdaal forest from 50 years ago. We know the exact location of the 13 plots and sample them again in 2018.

Research question: Is there a difference in species richness between old and recent sampling of forest fragments in Meerdaal forest?

Question 3 Solution (1)

data(meerdaal, package = "edcpR")
head(meerdaal)
colnames(meerdaal) <- c("Plot", "Old", "Recent")

# Difference in a quantitative variable between 2 paired samples
# Sample size = 13 
# Too small for a parametric test --> non-parametric test
# Assumption: independent samples --> false, it is resampling of the same plots!

Question 3 Solution (2)

# Wilcoxon signed rank test
wilcox.test(meerdaal$Old, meerdaal$Recent, paired = TRUE)

Question 4

data(kembel_sp, package ="edcpR")

We measured the abundance of species in percent cover of different plant species in 20x20m quadrats in grasslands in plots in two habitat types in Alberta.

Research question: Is there a difference in total species richness between two grassland types?

Question 4 Solution (1)

# We are comparing gamma diversity!
# If we want to compare the mean diversity per plot (alfa diversity, use t-test)
# Chao estimators of two groups with t-test (see Session 3: Diversity)

library(dplyr)
# Extract plots of mixed grasslands
mixed <- kembel_sp %>% filter(grepl("mix", plot)) %>% select(-plot)

# Extract plots with fescue grass
fes <- kembel_sp %>% filter(grepl("fes", plot)) %>% select(-plot)

Question 4 Solution (2)

library(vegan)
# Get the chao-estimators for each grassland
mixed_gamma <- specpool(mixed)
fes_gamma <- specpool(fes)

chao_mixed <- mixed_gamma$chao
chao_mixed_se <- mixed_gamma$chao.se
chao_fes <- fes_gamma$chao
chao_fes_se <- fes_gamma$chao.se

Question 4 Solution (3)

# Take 100 random samples from a normal distribution!
sample_mixed <- rnorm(100, mean = chao_mixed, sd = chao_mixed_se)
sample_fes <- rnorm(100, mean = chao_fes, sd = chao_fes_se)

Question 4 Solution (4)

# Perform a t-test
t.test(sample_mixed, sample_fes, paired = FALSE)

Assignment 4

Available on Toledo under Assignments (Assignment 4). Submit:

 

Upload everything before December 1st, 12 pm (= at noon!).



wardfont/edcpR documentation built on Dec. 23, 2021, 5:07 p.m.