knitr::opts_chunk$set(echo = TRUE, strip.white = FALSE, warning = FALSE,message = FALSE)
setwd("C:/Users/lisanjie2/Desktop/TEACHING/1_STATS_CalU/1_STAT_CalU_2016_by_NLB/1_STAT_Duq_2017/HOMEWORK/Duq_Biostats_Homework_4_Milk_regression/homework4_RMD_vs2")
dat2 <- read.csv("milk_subset.csv")

Introduction

Original Data

Skibiel et al 2013. Journal of Animal Ecology. The evolution of the nutrient composition of mammalian milks. 82: 1254-1264.

Model selection tutorial: Relative lactation length

They define relative lactation duration as:

"Lactation length was analysed relative to the total period of reproduction so that relative lactation length = absolute lactation length/(absolute gestation length + absolute lactation length)."

Data setup

Relative lactation duration

Create a variable for relative lactation length. Use the with() command to make the code clean

dat2$rel.lactation <- with(dat2, lacatation.months/(gestation.month +                                             lacatation.months))

Transformation

dat2$rel.lact.log10 <- log10(dat2$rel.lactation)

Relative Lactation duration is a proportion, so a logit transformation may be more appropriate.

#the car package has a logit function
library(car)

#apply logit transformation
dat2$rel.lact.logit <- logit(dat2$rel.lactation,percents = T)

Data exploration

Untransformaed dat: Plot log10(fat) ~ rel.lactation

library(ggplot2)

qplot(y = fat.log10,
      x = rel.lactation,
      data = dat2) + 
  ggtitle("no transformation of x")

Author's transformation: Plot log10(fat) ~ log10(rel.lactation)

qplot(y = fat.log10,
      x = rel.lact.log10,
      data = dat2)+ 
  ggtitle("log10 transformation of x")

Plot log10(fat) ~ logit(rel.lactation)

qplot(y = fat.log10,
      x = rel.lact.logit,
      data = dat2)

Plot logit(fat) ~ logit(rel.lactation)

qplot(y = fat.logit,
      x = rel.lact.logit,
      data = dat2)

Look at different transformations

Fit models of un-transformed lactation duration and compare to log and logit transformations

#model 1: log
f.rellact.raw <- lm(fat.log10 ~ rel.lactation, data = dat2)


#modlel 2: log-log
f.rellact.log10 <- lm(fat.log10 ~ rel.lact.log10, data = dat2)


#model 3: log-logit
f.rellact.logit <- lm(fat.log10 ~ rel.lact.logit, data = dat2)

The anova() command gives us the p value for the slope

anova(f.rellact.raw)
anova(f.rellact.log10)
anova(f.rellact.logit)

We can't compare these 3 models directly, but we can assess how well the fit the data by looking at their residuals

Look at residuals

Residuals for the raw data:

Resid vs. fitted looks pretty bad - definite triangle shape.

par(mfrow = c(2,2))
plot(f.rellact.raw)

Residuals for the log and logit

Resid vs. fitted looks pretty bad - can't say its really worse necessarily, but my intuition is that it is.

plot(f.rellact.log10)
plot(f.rellact.logit)

Model selection

They looked at different combinations of these predictors

Fit models

Fit models with all of the combinations of these predictors - this is a long list

Null model

#Model w/no predictors
###Note: authors don't fit this model
f.null <- lm(fat.log10 ~ 1,
             data = dat2)

Models w/ 1 predictor

#continous predictor
##rel lactation length (continous)
f.rellact <- lm(fat.log10 ~ rel.lact.log10,
                data = dat2)

#categorical predictor
##Biome (categorical)
f.biome <- lm(fat.log10 ~ biome,
              data = dat2)

##Diet (cat.)
f.diet <- lm(fat.log10 ~ diet, 
             data = dat2)

##Arid-adapted (cat)
f.arid <- lm(fat.log10 ~ arid,
             data = dat2)

Models w/2 predictors

#Biome + diet (2 categorical vars)
f.biome.diet <- lm(fat.log10 ~ biome + 
                               diet,
                   data = dat2)

#Rel lac length + diet (1 continous, 1 cat)
f.lact.diet <- lm(fat.log10 ~ rel.lact.log10 + 
                    diet,
                  data = dat2)


#Rel lac length + biome (1 cont, 1 cat)
f.lact.biome <- lm(fat.log10 ~ rel.lact.log10  +
                    biome, 
                  data = dat2)

Model w/3 terms

#Re lac  + diet + biome (1 cont, 2 cat)
f.lact.biome.diet <- lm(fat.log10 ~    
                          rel.lact.log10 + 
                          biome +
                          diet, 
                          data = dat2)

Model selection w/AIC

Get a single AIC value

We can get individual AIC values for a model using AIC()

AIC(f.biome)

QUERSTION: How do you interpret an AIC value?

ANSWER: You can't; on its own, and AIC value is meaningless

Get an AIC table

library(bbmle)
AICtab(base= TRUE,
       f.null,
       f.biome,
       f.rellact,
       f.diet,
       f.arid,
       f.biome.diet,
       f.lact.diet,
       f.lact.biome,
       f.lact.biome.diet
)

Get an AICc table

ICtab(type = "AICc",
      base = TRUE,
      f.null,
       f.biome,
       f.rellact,
       f.diet,
       f.arid,
       f.biome.diet,
       f.lact.diet,
       f.lact.biome,
       f.lact.biome.diet)

Check the impact of different transformations

#Re lac  + diet + biome (1 cont, 2 cat)
f.lact.biome.diet <- lm(fat.log10 ~    
                          rel.lact.log10 + 
                          biome +
                          diet, 
                          data = dat2)

Let's fit a model with %fat transformed on the logit scale

f.lact.biome.diet.LOGIT <- lm(fat.logit ~    
                          rel.lact.log10 + 
                          biome +
                          diet, 
                          data = dat2)

COmpare different transformation w/AIC

AICtab(base = TRUE,
       f.lact.biome.diet,
       f.lact.biome.diet.LOGIT)

QUESTION

ANSWER

Exploring the impact of missing data

Let's make a new dataset from our milk data and create some missing data

dat.fake <- dat2

I'll randomly remove 5 observations from just the "diet column"

#runif() draws random numbers from a uniform distribution
i.random <- runif(n = 5, min = 1, max = length(dat2$diet))

i.random <- round(i.random)

#Change random variables to NA
dat.fake[i.random, "diet"] <- NA

Now I"ll fit a model with rel.lact.log10 and diet to the original data

#just relative lacation
m.rel.lac <- lm(fat.logit ~ rel.lact.log10, 
                          data = dat2)

#relative lacation and diet
m.rel.lac.diet <- lm(fat.logit ~ rel.lact.log10 +
                       diet, 
                          data = dat2)

Now I'll fit a model to the faked data

#just relative lacation
fake.m.rel.lac <- lm(fat.logit ~ rel.lact.log10, 
                          data = dat.fake)

#relative lacation and diet
fake.m.rel.lac.diet <- lm(fat.logit ~ rel.lact.log10 +
                            diet, 
                          data = dat.fake)

Compare models with correct data

AICtab(base = TRUE,
       m.rel.lac,
       m.rel.lac.diet)

Compare models with faked data

AICtab(base = TRUE,
       fake.m.rel.lac,
       fake.m.rel.lac.diet)

The original and "fake" models with just lactation duration as a predictor are the same b/c they are fit to the same dataset

AICtab(base = TRUE,
       m.rel.lac,
       fake.m.rel.lac)

The original and "fake" models with "diet" add have different AICs because they are fit to different data

AICtab(base = TRUE,
       m.rel.lac.diet,
       fake.m.rel.lac.diet)

We will only see an error if we use AICc. This is because AICc incorporates sample size into its equation

ICtab(base = TRUE,
      type = "AICc",
       m.rel.lac.diet,
       fake.m.rel.lac.diet)

We'll also see an error if we try anova() on the two fake models

#test "fake"
anova(fake.m.rel.lac,
       fake.m.rel.lac.diet)

Model Diagnostics:

AIC(f.lact.biome.diet)

plot(f.lact.biome.diet)

QUESTION What do you thin about these residuals?

ANSWER The resid vs. fitted model is still pretty bad; addition of biome and diet as predictors do not improve the model's fit.

Plotting model output

Seperating indices for each plot element

First, I'll create indices for the 3 diet groups using which()

#indices for each diet group
i.omni <- which(dat2$diet == "omnivore")
i.carn <- which(dat2$diet == "carnivore")
i.herb <- which(dat2$diet == "herbivore")

Plot different diets

#plot all raw data
par(mfrow = c(1,1))
plot(fat.log10 ~ rel.lact.log10, 
     data = dat2)

#color carnivores
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.carn,],
       col = "red")

#color herbivores
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.herb,],
       col = "green")


#add a legend
legend("bottomleft", 
       legend = c("Carnivore", 
                  "Omnivore",
                  "Herbiore"),
       col = c("red",
               "green",
               "black"),
       pch = 1)

Add regression lines

Generate predictions

Generate predictions for plotting works best if we create a new dataframe with organized data the span the entire original dataset.

The range of lactation duration values

# ranges uses min() and max()
rng <- range(dat2[,"rel.lact.log10"])

Levels of the factor variables.

#levels of the diet factor
levs1 <- levels(dat2[,"diet"])

levs1

#levels of the biome fator
levs2 <- levels(dat2[,"biome"])

Create a new dataframe from which to generate predictions

newdat <- expand.grid(rel.lact.log10 = rng,
                      diet = levs1,
                      biome = levs2)

newdat

Note: I'm using the model with lactation and diet (f.lact.diet), NOT the one best model which also included "biome" (f.lact.biome.diet).

Generate prediction

y.hat <- predict(f.lact.diet,
                 newdata = newdat)

POP QUIZ y.hat - y = ??????

ANSWER* y.hat - y = residuals

Add the y.hat values (predictions) to the newdat dataframe

newdat$fat.y.hat <- y.hat

Plot everything

f.null, f.biome, f.rellact, f.diet, f.arid, f.biome.diet, f.lact.diet, f.lact.biome, f.lact.biome.diet

#Plot points
## plot base plot
plot(fat.log10 ~ rel.lact.log10, data = dat2)


##plot red points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red")

##plot green points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green")


#Plot regression lines

## Indices for each subset
i.carn.newdat <- which(newdat$diet == "carnivore")
i.omni.newdat <- which(newdat$diet == "omnivore")
i.herb.newdat <- which(newdat$diet == "herbivore")

## Plot lines
### type = "l" makes points() plot lines
points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.carn.newdat,],
       col = "red",
       type = "l")

points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.omni.newdat,],
       col = "black",
       type = "l")

points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.herb.newdat,],
       col = "green",
       type = "l")

QUESTION It looks like the herbivore and omnivore lines are almost exactly on top of each other. What does this mean?

ANSWER This means the must have very similar intercept parameters. I can check this with coef()

coef(f.lact.diet)

Both "dietherbivore" and "dietomnivore" have intercepts of about -0.57.

QUESTION How do you write this as a regression equation?

GENERAL MODEL STATEMENT y = Intercept + rel.lact.log10 + diet

The "diet" category has 3 levels

EQUATION WITH COEFFICIENTS y = 1.2 + -0.45rel.lact.log10 + -0.58(Is diet herbivore?) + -0.57*(Is diet omnivore?)

Back to heterogeneity of variance

Visualize coefficients

Here's a trick: I"m going to put all of our plotting code into a function

make.plot <- function(){
  #Plot points
## plot base plot
plot(fat.log10 ~ rel.lact.log10, data = dat2)


##plot red points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red")

##plot green points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green")


#Plot regression lines

## Indices for each subset
i.carn.newdat <- which(newdat$diet == "carnivore")
i.omni.newdat <- which(newdat$diet == "omnivore")
i.herb.newdat <- which(newdat$diet == "herbivore")

## Plot lines
### type = "l" makes points() plot lines
points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.carn.newdat,],
       col = "red",
       type = "l")

points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.omni.newdat,],
       col = "black",
       type = "l")

points(fat.y.hat ~ rel.lact.log10, 
       data = newdat[i.herb.newdat,],
       col = "green",
       type = "l")
}

Now make the plot

make.plot()

Make the plot and add intercept

#make plot
make.plot()

#add verticle line at intercept
abline(v = 0)

#get intercept term from model
int <- coef(f.lact.diet)[1]

#plot arow for intercept
arrows(y0 = int,
       x0 = -0.2,
       y1 = int,x1 = 0, 
       length = 0.1,col = 2,lwd = 3)

Make the plot and add intercept+"dietherbivore" term

#make plot
make.plot()

#add verticle line at intercept
abline(v = 0)

#get intercept term from model
int <- coef(f.lact.diet)[1]
dietherbivore <- coef(f.lact.diet)[3]


#add the intercept and "dietherbivore" terms together
int.plus.dietherbivore <- int+dietherbivore

#plot arow for intercept
arrows(y0 = int,
       x0 = -0.2,
       y1 = int,x1 = 0, 
       length = 0.1,col = 2,lwd = 3)

#plot arow for intercept+dietherbivore
arrows(y0 = int.plus.dietherbivore,
       x0 = -0.2,
       y1 = int.plus.dietherbivore,
       x1 = 0, 
       length = 0.1,col = 3,
       lwd = 3)

Further model and data explorations

What about outliers identified previously

Index values of weird groups

# 
i.aust <- which(dat2$Australia == "Australia")
i.lemur <- which(dat2$lemurs == "lemur")
i.odd.ung <- which(dat2$order == "Perrissodactyla")

Plot Australia

#make plot
make.plot()

#plot points
## australia spp
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.aust,],
       col = "black",
       pch = 16)

Plot Lemurs

#make plot
make.plot()

##lemurs
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.lemur,],
       col = "black",
       pch = 22)

Plot odd-toed ungulates

#make plot
make.plot()

## odd-toed ungulates
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.odd.ung,],
       col = "black",
       pch = 3)

Plot everything

#make plot
make.plot()


## australia spp
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.aust,],
       col = "black",
       pch = 16)

##lemurs
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.lemur,],
       col = "black",
       pch = 22)

## odd-toed ungulates
points(fat.log10 ~ rel.lact.log10, 
       data = dat2[i.odd.ung,],
       col = "black",
       pch = 3)




#plot legend
legend("bottomleft",pch = c(16,22,3,1), legend = c("Australia","lemur","even-toed ungulates","other"))


brouwern/mammalsmilk documentation built on May 17, 2019, 10:38 a.m.