knitr::opts_chunk$set(echo = TRUE, strip.white = FALSE, warning = FALSE,message = FALSE)
setwd("C:/Users/lisanjie2/Desktop/TEACHING/1_STATS_CalU/1_STAT_CalU_2016_by_NLB/1_STAT_Duq_2017/HOMEWORK/Duq_Biostats_Homework_4_Milk_regression/homework4_RMD_vs2") dat2 <- read.csv("milk_subset.csv")
Skibiel et al 2013. Journal of Animal Ecology. The evolution of the nutrient composition of mammalian milks. 82: 1254-1264.
They define relative lactation duration as:
"Lactation length was analysed relative to the total period of reproduction so that relative lactation length = absolute lactation length/(absolute gestation length + absolute lactation length)."
Create a variable for relative lactation length. Use the with() command to make the code clean
dat2$rel.lactation <- with(dat2, lacatation.months/(gestation.month + lacatation.months))
dat2$rel.lact.log10 <- log10(dat2$rel.lactation)
Relative Lactation duration is a proportion, so a logit transformation may be more appropriate.
#the car package has a logit function library(car) #apply logit transformation dat2$rel.lact.logit <- logit(dat2$rel.lactation,percents = T)
library(ggplot2) qplot(y = fat.log10, x = rel.lactation, data = dat2) + ggtitle("no transformation of x")
qplot(y = fat.log10, x = rel.lact.log10, data = dat2)+ ggtitle("log10 transformation of x")
qplot(y = fat.log10, x = rel.lact.logit, data = dat2)
qplot(y = fat.logit, x = rel.lact.logit, data = dat2)
Fit models of un-transformed lactation duration and compare to log and logit transformations
#model 1: log f.rellact.raw <- lm(fat.log10 ~ rel.lactation, data = dat2) #modlel 2: log-log f.rellact.log10 <- lm(fat.log10 ~ rel.lact.log10, data = dat2) #model 3: log-logit f.rellact.logit <- lm(fat.log10 ~ rel.lact.logit, data = dat2)
The anova() command gives us the p value for the slope
anova(f.rellact.raw) anova(f.rellact.log10) anova(f.rellact.logit)
We can't compare these 3 models directly, but we can assess how well the fit the data by looking at their residuals
Resid vs. fitted looks pretty bad - definite triangle shape.
par(mfrow = c(2,2)) plot(f.rellact.raw)
Resid vs. fitted looks pretty bad - can't say its really worse necessarily, but my intuition is that it is.
plot(f.rellact.log10)
plot(f.rellact.logit)
They looked at different combinations of these predictors
Fit models with all of the combinations of these predictors - this is a long list
#Model w/no predictors ###Note: authors don't fit this model f.null <- lm(fat.log10 ~ 1, data = dat2)
#continous predictor ##rel lactation length (continous) f.rellact <- lm(fat.log10 ~ rel.lact.log10, data = dat2) #categorical predictor ##Biome (categorical) f.biome <- lm(fat.log10 ~ biome, data = dat2) ##Diet (cat.) f.diet <- lm(fat.log10 ~ diet, data = dat2) ##Arid-adapted (cat) f.arid <- lm(fat.log10 ~ arid, data = dat2)
#Biome + diet (2 categorical vars) f.biome.diet <- lm(fat.log10 ~ biome + diet, data = dat2) #Rel lac length + diet (1 continous, 1 cat) f.lact.diet <- lm(fat.log10 ~ rel.lact.log10 + diet, data = dat2) #Rel lac length + biome (1 cont, 1 cat) f.lact.biome <- lm(fat.log10 ~ rel.lact.log10 + biome, data = dat2)
#Re lac + diet + biome (1 cont, 2 cat) f.lact.biome.diet <- lm(fat.log10 ~ rel.lact.log10 + biome + diet, data = dat2)
We can get individual AIC values for a model using AIC()
AIC(f.biome)
QUERSTION: How do you interpret an AIC value?
ANSWER: You can't; on its own, and AIC value is meaningless
library(bbmle) AICtab(base= TRUE, f.null, f.biome, f.rellact, f.diet, f.arid, f.biome.diet, f.lact.diet, f.lact.biome, f.lact.biome.diet )
ICtab(type = "AICc", base = TRUE, f.null, f.biome, f.rellact, f.diet, f.arid, f.biome.diet, f.lact.diet, f.lact.biome, f.lact.biome.diet)
#Re lac + diet + biome (1 cont, 2 cat) f.lact.biome.diet <- lm(fat.log10 ~ rel.lact.log10 + biome + diet, data = dat2)
Let's fit a model with %fat transformed on the logit scale
f.lact.biome.diet.LOGIT <- lm(fat.logit ~ rel.lact.log10 + biome + diet, data = dat2)
AICtab(base = TRUE, f.lact.biome.diet, f.lact.biome.diet.LOGIT)
QUESTION
ANSWER
Let's make a new dataset from our milk data and create some missing data
dat.fake <- dat2
I'll randomly remove 5 observations from just the "diet column"
#runif() draws random numbers from a uniform distribution i.random <- runif(n = 5, min = 1, max = length(dat2$diet)) i.random <- round(i.random) #Change random variables to NA dat.fake[i.random, "diet"] <- NA
Now I"ll fit a model with rel.lact.log10 and diet to the original data
#just relative lacation m.rel.lac <- lm(fat.logit ~ rel.lact.log10, data = dat2) #relative lacation and diet m.rel.lac.diet <- lm(fat.logit ~ rel.lact.log10 + diet, data = dat2)
Now I'll fit a model to the faked data
#just relative lacation fake.m.rel.lac <- lm(fat.logit ~ rel.lact.log10, data = dat.fake) #relative lacation and diet fake.m.rel.lac.diet <- lm(fat.logit ~ rel.lact.log10 + diet, data = dat.fake)
Compare models with correct data
AICtab(base = TRUE, m.rel.lac, m.rel.lac.diet)
Compare models with faked data
AICtab(base = TRUE, fake.m.rel.lac, fake.m.rel.lac.diet)
The original and "fake" models with just lactation duration as a predictor are the same b/c they are fit to the same dataset
AICtab(base = TRUE, m.rel.lac, fake.m.rel.lac)
The original and "fake" models with "diet" add have different AICs because they are fit to different data
AICtab(base = TRUE, m.rel.lac.diet, fake.m.rel.lac.diet)
We will only see an error if we use AICc. This is because AICc incorporates sample size into its equation
ICtab(base = TRUE, type = "AICc", m.rel.lac.diet, fake.m.rel.lac.diet)
We'll also see an error if we try anova() on the two fake models
#test "fake" anova(fake.m.rel.lac, fake.m.rel.lac.diet)
AIC(f.lact.biome.diet) plot(f.lact.biome.diet)
QUESTION What do you thin about these residuals?
ANSWER The resid vs. fitted model is still pretty bad; addition of biome and diet as predictors do not improve the model's fit.
First, I'll create indices for the 3 diet groups using which()
#indices for each diet group i.omni <- which(dat2$diet == "omnivore") i.carn <- which(dat2$diet == "carnivore") i.herb <- which(dat2$diet == "herbivore")
#plot all raw data par(mfrow = c(1,1)) plot(fat.log10 ~ rel.lact.log10, data = dat2) #color carnivores points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,], col = "red") #color herbivores points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,], col = "green") #add a legend legend("bottomleft", legend = c("Carnivore", "Omnivore", "Herbiore"), col = c("red", "green", "black"), pch = 1)
Generate predictions for plotting works best if we create a new dataframe with organized data the span the entire original dataset.
The range of lactation duration values
# ranges uses min() and max() rng <- range(dat2[,"rel.lact.log10"])
Levels of the factor variables.
#levels of the diet factor levs1 <- levels(dat2[,"diet"]) levs1 #levels of the biome fator levs2 <- levels(dat2[,"biome"])
Create a new dataframe from which to generate predictions
newdat <- expand.grid(rel.lact.log10 = rng, diet = levs1, biome = levs2) newdat
Note: I'm using the model with lactation and diet (f.lact.diet), NOT the one best model which also included "biome" (f.lact.biome.diet).
y.hat <- predict(f.lact.diet, newdata = newdat)
POP QUIZ y.hat - y = ??????
ANSWER* y.hat - y = residuals
Add the y.hat values (predictions) to the newdat dataframe
newdat$fat.y.hat <- y.hat
f.null, f.biome, f.rellact, f.diet, f.arid, f.biome.diet, f.lact.diet, f.lact.biome, f.lact.biome.diet
#Plot points ## plot base plot plot(fat.log10 ~ rel.lact.log10, data = dat2) ##plot red points points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red") ##plot green points points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green") #Plot regression lines ## Indices for each subset i.carn.newdat <- which(newdat$diet == "carnivore") i.omni.newdat <- which(newdat$diet == "omnivore") i.herb.newdat <- which(newdat$diet == "herbivore") ## Plot lines ### type = "l" makes points() plot lines points(fat.y.hat ~ rel.lact.log10, data = newdat[i.carn.newdat,], col = "red", type = "l") points(fat.y.hat ~ rel.lact.log10, data = newdat[i.omni.newdat,], col = "black", type = "l") points(fat.y.hat ~ rel.lact.log10, data = newdat[i.herb.newdat,], col = "green", type = "l")
QUESTION It looks like the herbivore and omnivore lines are almost exactly on top of each other. What does this mean?
ANSWER This means the must have very similar intercept parameters. I can check this with coef()
coef(f.lact.diet)
Both "dietherbivore" and "dietomnivore" have intercepts of about -0.57.
QUESTION How do you write this as a regression equation?
GENERAL MODEL STATEMENT y = Intercept + rel.lact.log10 + diet
The "diet" category has 3 levels
EQUATION WITH COEFFICIENTS y = 1.2 + -0.45rel.lact.log10 + -0.58(Is diet herbivore?) + -0.57*(Is diet omnivore?)
Here's a trick: I"m going to put all of our plotting code into a function
make.plot <- function(){ #Plot points ## plot base plot plot(fat.log10 ~ rel.lact.log10, data = dat2) ##plot red points points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red") ##plot green points points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green") #Plot regression lines ## Indices for each subset i.carn.newdat <- which(newdat$diet == "carnivore") i.omni.newdat <- which(newdat$diet == "omnivore") i.herb.newdat <- which(newdat$diet == "herbivore") ## Plot lines ### type = "l" makes points() plot lines points(fat.y.hat ~ rel.lact.log10, data = newdat[i.carn.newdat,], col = "red", type = "l") points(fat.y.hat ~ rel.lact.log10, data = newdat[i.omni.newdat,], col = "black", type = "l") points(fat.y.hat ~ rel.lact.log10, data = newdat[i.herb.newdat,], col = "green", type = "l") }
Now make the plot
make.plot()
Make the plot and add intercept
#make plot make.plot() #add verticle line at intercept abline(v = 0) #get intercept term from model int <- coef(f.lact.diet)[1] #plot arow for intercept arrows(y0 = int, x0 = -0.2, y1 = int,x1 = 0, length = 0.1,col = 2,lwd = 3)
Make the plot and add intercept+"dietherbivore" term
#make plot make.plot() #add verticle line at intercept abline(v = 0) #get intercept term from model int <- coef(f.lact.diet)[1] dietherbivore <- coef(f.lact.diet)[3] #add the intercept and "dietherbivore" terms together int.plus.dietherbivore <- int+dietherbivore #plot arow for intercept arrows(y0 = int, x0 = -0.2, y1 = int,x1 = 0, length = 0.1,col = 2,lwd = 3) #plot arow for intercept+dietherbivore arrows(y0 = int.plus.dietherbivore, x0 = -0.2, y1 = int.plus.dietherbivore, x1 = 0, length = 0.1,col = 3, lwd = 3)
Index values of weird groups
# i.aust <- which(dat2$Australia == "Australia") i.lemur <- which(dat2$lemurs == "lemur") i.odd.ung <- which(dat2$order == "Perrissodactyla")
#make plot make.plot() #plot points ## australia spp points(fat.log10 ~ rel.lact.log10, data = dat2[i.aust,], col = "black", pch = 16)
#make plot make.plot() ##lemurs points(fat.log10 ~ rel.lact.log10, data = dat2[i.lemur,], col = "black", pch = 22)
#make plot make.plot() ## odd-toed ungulates points(fat.log10 ~ rel.lact.log10, data = dat2[i.odd.ung,], col = "black", pch = 3)
#make plot make.plot() ## australia spp points(fat.log10 ~ rel.lact.log10, data = dat2[i.aust,], col = "black", pch = 16) ##lemurs points(fat.log10 ~ rel.lact.log10, data = dat2[i.lemur,], col = "black", pch = 22) ## odd-toed ungulates points(fat.log10 ~ rel.lact.log10, data = dat2[i.odd.ung,], col = "black", pch = 3) #plot legend legend("bottomleft",pch = c(16,22,3,1), legend = c("Australia","lemur","even-toed ungulates","other"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.