knitr::opts_knit$set(root.dir = "/Users/Megan/Documents/GitHub/CAMI/data/Marx_data")

CAMI tutorial #2 {-}

1. Introduction

Marx et al (2015) surveyed 442 vascular plants species across 80 islands in the San Juan archipelago, which is a series of islands in between Washington state and Vancouver Island, BC, Canada. They reconstrcuted the community phylogeny for all of these species, using a supertree appraoch, and gathered data on several functional traits (seed mass, max height, specific leaf area (SLA), leaf size, and leaf N content).

In their analyses, they investigated mean pairwise distance (MPD) and mean-nearest taxon distance (MNTD) using phylgoenetic infornationa, and then also using the functional trait information. They used these standardized metrics as test statistics for significant over and under-dispersion compared to randomly assembly communities in statistical hypotheis testing.

We advocate that using CAMI is an alternative to statistical hypothesis testing, as inferece here is based on performing model-selection with random forests (RF) and Approximate Bayesian Computation (ABC). In this, we can incorperate uncertainty in our model inference and simulatenously compare the support for each assembly process.

Below, we go through analysizing Marx et al.'s (2015) data in CAMI.

2. Importing Data

2.1 Phylogenetic

We will first load the community phylogenetic tree constrcuted by Marx et al. (2015). This tree has 367 of the species identified on th San Juan Islands. The tree is a result of combining publically available sequence data from GenBank for five gene regions (atpB, rbcL, matK, trnTLF and ITS) using the PHLAWD pipeline (Smith et al., 2009). The phylogenetic tree was infered using Maximum Likelihood.

Make sure you're in the same directory as the tree file. You can use the command setwd("path/to/directory").

require(geiger)

## the tree is imported as a object of class phylo
SanJuanTree <- read.tree("Marx.tre")

## explore the phylo object
SanJuanTree

2.2 Community Matrix

Next, we will load the community data matrix which identifies which species are located on what islands. In this matrix, the rows will be all of the species present on all islands and the columns each represent a different island. Inside the cells is then presence/absence data.

## the community matrix is imported as a data frame object
communityMatrix <- read.csv(file="CommunityDataMatrix.csv", sep=",", header=T, row.names = 1)

## we can look at the first 5 rows with head, and distinguish only the first 5 columns with [] indexing brackets.
head(communityMatrix[,1:4])

## also can load some island Metadata
metadata <- read.csv(file="IslandMetadata.txt", header = T, sep = ",", row.names = 1)

2.3 Functional Traits

Finally, we will load the functional trait data matrix. Again, in this matrix, each row is a species and each column corresponds to a trait. The traits data includes native/invasive status, seed mass (mg), maximum height (m), specific leaf area (SLA, cm2), leaf size (cm2), and leaf N content.

## the functional trait data is also imported as a data frame object
traitData <-  read.csv(file="FunctionalTraitData.csv", sep=",", header=T, row.names = 1)

## note there is missing data present as NAs
head(traitData)

## which traits have the most missing data associated with them?
colSums(is.na(traitData))

3. Community Assembly Simulations

3.1 Trait Evolution Model

We will investigate seed mass (mg) as our functional trait of interest, first. For this we need to see what model of trait evolution seed mass fits, given the species that we have data for in the community. We will be comparing two models of trait evolution, Brownian Motion and Ornstein-Uhlenbeck. If we can determine the best model of trait evolution to simulate under, then we can cut the number of models we would need to simulate under in CAMI in half.

pick a trait here

## PICK THE TRAIT HERE
TraitVec <- traitData$Maximum.Height..m.
names(TraitVec) <- rownames(traitData)
TraitVec <- TraitVec[names(TraitVec) %in% SanJuanTree$tip.label]

## remove species from tree when there is missing trait data
missingSpecies <- names(TraitVec)[is.na(TraitVec)]
RegTraitTree <- drop.tip(phy = SanJuanTree, tip = missingSpecies)
RegTraitTree <- drop.tip(phy = RegTraitTree, tip = "Arctostaphylos_media")

## remove species from trait data if missing 
TraitVec <- TraitVec[!is.na(TraitVec)]

## check the new tree and trait data objects to make sure they match with name.check()
name.check(RegTraitTree, TraitVec)

Now that the data is somewhat curated, we will figure out which model of trait evolution best fits the seed mass data given the phylogeny and trait values. We will use the fitConintuous() function from geiger to fit both the BM and OU models of trait evoluton to the data. We will perform model selection using AIC, meaning the model with the lower AIC will be the model we simulate data under.

## if trait is seed mass, you mgiht want to do this bc these two species' seed mass is super large
RegTraitTree <- drop.tip(RegTraitTree, "Quercus_garryana")
TraitVec <- TraitVec[names(TraitVec) != "Quercus_garryana"]
RegTraitTree <- drop.tip(RegTraitTree, "Corylus_cornuta")
TraitVec <- TraitVec[names(TraitVec) != "Corylus_cornuta"]

hist(TraitVec)
#RegTraitTree, TraitVec
BM.mod <- fitContinuous(phy = RegTraitTree, dat = log10(TraitVec)*10, model = "BM", control=list(niter=100))
OU.mod <- fitContinuous(phy = RegTraitTree, dat = log10(TraitVec)*10, model = "OU", bounds=list(alpha=c(0.001, 0.02)), control=list(niter=100))

summary(TraitVec)
## you can use '$opt' after the objects to   at the lnL estimated for each model, as well as the AICc calculated.
c(BM.mod$opt$aicc, OU.mod$opt$aicc) - min(BM.mod$opt$aicc, OU.mod$opt$aicc)

3.2 Simulate Assembly data

sims <- 1000
N <- length(RegTraitTree$tip.label)
local <- c(25,150)

require("CAMI")
#simulate data on crick under emp sims screen
AssemblyData <- SimCommunityAssembly(sims, N, local, 
                                       traitsim = "OU", 
                                       comsim = "competition", 
                                       sig2 = OU.mod$opt$sigsq, 
                                       alpha = OU.mod$opt$alpha, 
                                       tau=c(1,60))


#AssemblyData_SLA <- AssemblyData
load(file="OUAssemblyData.Rdata")
AssemblyData <- OUassemblyData
````


#### 3.3 Summary Statistics

```r

SanJuanCommunities <- communityMatrix[,colSums(communityMatrix) >= 20]
SanJuanCommunities <- SanJuanCommunities[,-1]
SanJuanCommunities <- SanJuanCommunities[rownames(SanJuanCommunities) %in%  names(TraitVec),]


for (i in 1:ncol(SanJuanCommunities)) {
  species.to.drop <- rownames(SanJuanCommunities)[SanJuanCommunities[,i]==0]
  local.tree <- drop.tip(RegTraitTree, species.to.drop)
  if (i == 1){
    SanJuanLocalTrees <- local.tree }
  else{  SanJuanLocalTrees <- c(SanJuanLocalTrees, local.tree)
}}

localcomsizes <- c()
for (i in 1:ncol(SanJuanCommunities)) {
  c <- length(SanJuanLocalTrees[[i]]$tip.label)
  localcomsizes <- c(localcomsizes, c)
}  

range(localcomsizes)

## Get corresponding trait datasets
SJlocalTraits <- list()
for (i in 1:ncol(SanJuanCommunities)) {
 local.traits <- TraitVec[ match(SanJuanLocalTrees[[i]]$tip.label, names(TraitVec))]
 SJlocalTraits[[i]] <- local.traits
}

## calculate summary statistics
SJsummaryStats <- matrix(NA, ncol(SanJuanCommunities), 30)
for (i in 1:ncol(SanJuanCommunities)){
  stats<-  CalcSummaryStats(regional.tree = RegTraitTree,
                                      local.tree = SanJuanLocalTrees[[i]],
                                      regional.traits = log10(TraitVec),
                                      local.traits = log10(SJlocalTraits[[i]]))
  SJsummaryStats[i,] <- stats
  colnames(SJsummaryStats) <- names(stats)
}
rownames(SJsummaryStats) <- colnames(SanJuanCommunities)

4. Random Forests

modelIndex <- c()
modelIndex <- as.character(AssemblyData$params[,2])
summaryStats <- AssemblyData$summary.stats
ref.table <- c()
ref.table <- data.frame(summaryStats[,-c(17,18,21,22,29)], modelIndex)
ref.table <- na.omit(object = ref.table)

#determine rf error rates using all data
#require(randomForest)
rf <- randomForest(modelIndex ~., data=ref.table, ntree=1000, importance=T)

SJModPredictions <- predict(rf, SJsummaryStats[,-c(17,18,21,22,29)], type="vote")
colSums(SJModPredictions)

#create color palette:
library(RColorBrewer)
coul = brewer.pal(5, "Set2") 

t(SJModPredictions)

# Make a stacked barplot--> it will be in %!
pdf(file = "seedMass_barplotSanJan.pdf", width = 5, height = 4)
barplot(t(SJModPredictions), col=coul , border="white", xlab="San Juan Communities" )
#legend("top", legend = c("competition", "filtering", "neutral"), fill = coul, cex=2)

Random Forest with parameter estimates

##might be good to look at some params.
require(ggplot2)
trues <- AssemblyData$params[,2]=="competition"
TauIndex <- as.numeric(paste(AssemblyData$params[trues,10])) 

summaryStatsTau <- AssemblyData$summary.stats[trues,-c(17,18,21,22,29)]
ref.table.Regress <- data.frame(summaryStatsTau, TauIndex)
ref.table.Regress <- na.omit(object = ref.table.Regress)
rf.param <- randomForest(TauIndex ~., data=ref.table.Regress, ntree=1000, importance=T)

SJParamPredictions <- predict(rf.param, SJsummaryStats[,-c(17,18,21,22,29)], type="response")
hist(SJParamPredictions)

##make plot
pdf('Heigt_Competiton_SanJuanParam.pdf', width=5, height=4)
tau.pp <- data.frame(dt=factor(c(rep("prior", each=nrow(AssemblyData$params)), 
                                 rep("posterior", each= length(SJParamPredictions)))),
                     tau = c(as.numeric(paste(AssemblyData$params[trues,10])),  SJParamPredictions))

ggplot(tau.pp, aes(x=tau, fill=dt)) +
  geom_density() +
  scale_fill_manual(values=c(rgb(.2, .2, .2, .8), rgb(.5, .5, .5, .4))) +
  theme(legend.position="none") +
  geom_vline(aes(xintercept=median(SJParamPredictions)), color="black", linetype="dashed", lwd=1.2)
SJModPredictions <- SJModPredictions[-c(22,27),]
SJParamPredictions <- SJParamPredictions[-c(22,27)]
SJ_metadata <- SJ_metadata[-c(22,27),]

plot(SJParamPredictions~ SJ_metadata[,1])
lm(SJParamPredictions~ SJ_metadata[,1])
plot(SJModPredictions[,2]~ SJParamPredictions[])

plot(SJModPredictions[,2] ~ SJ_metadata[,1])

## This is what I will plot and talk about
plot(SJModPredictions[,2] ~ SJ_metadata[,1])
abline(lm(SJModPredictions[,2] ~ SJ_metadata[,1]))

plot(SJParamPredictions~ SJ_metadata[,1], xlim=c(0,50000))
abline(lm(SJParamPredictions ~ SJ_metadata[,1]))


colnames(SJ_metadata)

pdf('ModProb_V_TauEst_8Kipukas.pdf', width=6, height=4.5)
plot(SJModPredictions[,2] ~ SJ_metadata[,1], pch=21, cex=1.8, ylab="Model Support for Environmental Filtering", xlab="T_E Median Estimate", bty="l", bg="grey80")
abline(lm(SJModPredictions[,2] ~ SJ_metadata[,1]), lwd=1.5, lty=5, col="grey40")

ABC

require(ggplot2)
Tau.estimates <- list()
trues <- OUassemblyData$params[,2]=="filtering"

TauParams <- OUassemblyData$params[trues,5:10]
SJsummaryStats.ABC <- SJsummaryStats[, c(4, 10, 12, 13, 16, 18, 19, 20)]

for (i in 1:ncol(SanJuanCommunities)){
  SJ.Param.Est <- abc(target = SJsummaryStats.ABC[i,], 
                      param = TauParams, 
                      sumstat = summaryStatsTau[,c(4, 10, 12, 13, 16, 18, 19, 20)], tol = 0.01, method="rejection")

  tau <- as.numeric(paste(SJ.Param.Est$unadj.values[,colnames(SJ.Param.Est$unadj.values) == "tau"]))

  Tau.estimates[[i]] <- tau
}

median.taus <- c()
for (i in 1:ncol(SanJuanCommunities)) {
  median.taus <- c(median.taus, median(Tau.estimates[[i]]))
}

plot(SJModPredictions[,2]~ median.taus)

islands <- colnames(SanJuanCommunities)

SJ_metadata <- metadata[match(islands, rownames(metadata)),]
colnames(SJ_metadata)

SJModPredictions <- SJModPredictions[-c(22,27),]
SJ_metadata <- SJ_metadata[-c(22,27),]
median.taus <- median.taus[-c(22,27)]

plot(SJModPredictions[,2]~ SJ_metadata[,1])

which(SJ_metadata[,3] >= 40 & SJ_metadata[,3] <= 60)

plot(median.taus ~ SJ_metadata[,1])

hist(SJ_metadata[,3])

which(SJ_metadata[,1] > 300000)


SJModPredictions_2 <- SJModPredictions[which(SJ_metadata[,3] >= 40 & SJ_metadata[,3] <= 60),]
SJ_metadata_2 <- SJ_metadata[which(SJ_metadata[,3] >= 40 & SJ_metadata[,3] <= 60),]
median.taus_2 <- median.taus[which(SJ_metadata[,3] >= 40 & SJ_metadata[,3] <= 60)]

plot(median.taus_2 ~ SJ_metadata_2[,1])
plot(SJModPredictions_2[,2] ~ SJ_metadata_2[,1])

plot(SJModPredictions_2[,2] ~ median.taus_2)


pdf('ModProb_V_TauEst_8Kipukas.pdf', width=6, height=4.5)
plot(ParamEstDataTable[,3] ~ ParamEstDataTable[,4], pch=22, cex=1.8, ylab="Model Support for Environmental Filtering", xlab="Tau Median Estimate", bty="l", bg="grey80")
abline(lm(ParamEstDataTable[,3] ~ ParamEstDataTable[,4]), lwd=1.5, lty=5, col="grey40")
points(ParamEstDataTable[,2] ~ ParamEstDataTable[,4], pch=16, cex=1.5, ylab="Model Support for Environmental Filtering", xlab="Tau Median Estimate")
abline(lm(ParamEstDataTable[,2] ~ ParamEstDataTable[,4]), lwd=1.5, lty=1, col="black")
legend("topright", legend = c("ABC", "RF"), pch=c(22, 16), col=c("black", "black"), lty=c(5,1), cex=1.5, pt.bg=c("grey80", "black"), pt.cex=c(1.8, 1.5), lwd=c(1.2,1.5), seg.len=4, bty="n")


ruffleymr/CAMI documentation built on Aug. 21, 2019, 2:55 p.m.