PhyloFactor | R Documentation |
Regression-based phylofactorization
PhyloFactor(Data, tree, X = NULL, frmla = Data ~ X, choice = "var",
transform.fcn = log, contrast.fcn = NULL, method = "glm",
nfactors = NULL, small.output = F, stop.fcn = NULL,
stop.early = NULL, KS.Pthreshold = 0.01, alternative = "greater",
ncores = NULL, delta = 0.65, choice.fcn = NULL,
cluster.depends = "", ...)
Data |
Data matrix whose rows are tip labels of the tree, columns are samples of the same length as X, and whose columns sum to 1 |
tree |
Phylogeny whose tip-labels are row-names in Data. |
X |
independent variable. If performing multiple regression, X must be a data frame whose columns contain all the independent variables used in |
frmla |
Formula for input in GLM. Default formula is Data ~ X. |
choice |
Choice, or objective, function for determining the best edges at each iteration using default regression. Must be choice='var' or choice='F'. 'var' minimizes residual variance of clr-transformed data, whereas 'F' maximizes the F-statistic from an analysis of variance. |
transform.fcn |
Function for transforming data prior to projection onto contrast bases. Default is |
contrast.fcn |
Contrast function. Default is an efficient version of |
method |
Which default objective function to use either "glm", "max.var" or "gam". |
nfactors |
Number of clades or factors to produce in phylofactorization. Default, NULL, will iterate phylofactorization until either dim(Data)[1]-1 factors, or until stop.fcn returns T |
small.output |
Logical, indicating whether or not to trim output. If |
stop.fcn |
Currently, accepts input of 'KS'. Coming soon: input your own function of the environment in phylofactor to determine when to stop. |
stop.early |
Logical indicating if stop.fcn should be evaluated before (stop.early=T) or after (stop.early=F) choosing an edge maximizing the objective function. |
KS.Pthreshold |
Numeric between 0 and 1. P-value threshold for KS-test as default stopping-function. |
alternative |
alternative hypothesis input to |
ncores |
Number of cores for built-in parallelization of phylofactorization. Parallelizes the extraction of groups, amalgamation of data based on groups, regression, and calculation of objective function. Be warned - this can lead to R taking over a system's memory. |
delta |
Numerical value for replacement of zeros. Default is 0.65, so zeros will be replaced column-wise with 0.65*min(x[x>0]) |
choice.fcn |
Function for customized choice function. Must take as input the numeric vector of ilr coefficients |
cluster.depends |
Character parsed and evaluated by cluster to load all dependencies for custom choice.fcn. e.g. |
... |
optional input arguments for |
Phylofactor object, a list containing: "Data", "tree" - inputs from phylofactorization. Output also includes "factors","glms","terminated" - T if stop.fcn terminated factorization, F otherwise - "bins", "bin.sizes", "basis" - basis for projection of data onto phylofactors, and "Monophyletic.Clades" - a list of which bins are monophyletic and have bin.size>1. For customized choice.fcn
, Phylofactor outputs $custom.output
.
set.seed(2)
library(phylofactor)
library(phangorn)
library(mgcv)
mar <- par('mar')
clo <- function(X) X/rowSums(X)
## Example with pseudo-simulated data: real tree with real taxonomy, but fake abundance patterns.
data("FTmicrobiome")
tree <- FTmicrobiome$tree
Taxonomy <- FTmicrobiome$taxonomy
tree <- drop.tip(tree,setdiff(tree$tip.label,sample(tree$tip.label,20)))
### plot phylogeny ###
plot.phylo(tree,use.edge.length=FALSE,main='Community Phylogeny')
nodelabels()
Taxonomy <- Taxonomy[match(tree$tip.label,Taxonomy[,1]),]
X <- as.factor(c(rep(0,5),rep(1,5)))
### Simulate data ###
Factornodes <- c(37,27)
Factoredges <- sapply(Factornodes,FUN=function(n,tree) which(tree$edge[,2]==n),tree=tree)
edgelabels(c('PF 1','PF 2'),edge=Factoredges,cex=2,bg='red')
sigClades <- Descendants(tree,Factornodes,type='tips')
Data <- matrix(rlnorm(20*10,meanlog = 8,sdlog = .5),nrow=20)
rownames(Data) <- tree$tip.label
colnames(Data) <- X
Data[sigClades[[1]],X==0] <- Data[sigClades[[1]],X==0]*8
Data[sigClades[[2]],X==1] <- Data[sigClades[[2]],X==1]*9
Data <- t(clo(t(Data)))
Bins <- bins(G=sigClades,set=1:20)
pf.heatmap(tree=tree,Data=Data)
### PhyloFactor ###
PF <- PhyloFactor(Data,tree,X,nfactors=2)
PF$bins
all(PF$bins %in% Bins)
######### Summary tools ##########
PF$factors
# Notice that both of the groups at the first factor are labelled as "Monophyletic"
# due to the unrooting of the tree
PF$ExplainedVar
# A coarse summary tool
s <- pf.summary(PF,Taxonomy,factor=1)
s$group1$IDs # Grabbing group IDs
s$group2$IDs
# A tidier summary tool
td <- pf.tidy(s)
td$`group1, Monophyletic`
# Simplified group IDs - the unique shortest unique prefixes separating the groups
td$`group2, Monophyletic`
## Plotting with summary tools ##
par(mfrow=c(1,1),mar=mar)
plot(as.numeric(X),td$`Observed Ratio of group1/group2 geometric means`,
ylab='Average ratio of Group1/Group2',pch=18,cex=2)
lines(td$`Predicted ratio of group1/group2`,lwd=2)
legend(1,12,legend=c('Observed','Predicted'),pch=c(18,NA),lwd=c(NA,2),
lty=c(NA,1),cex=2)
######### get and plot Phylogenetic info ####
PFedges <- getFactoredEdgesPAR(ncores=2,PF=PF) %>% unlist
## unlisting is unwise if any factor corresponds to more than one edge
PFnodes <- tree$edge[PFedges,2]
PFclades <- Descendants(tree,PFnodes,'tips')
par(mfrow=c(3,1))
pf.heatmap(tree=tree,Data=Data)
# edgelabels(c('Factor 1','Factor 2'),edge=PFedges,bg=c('yellow','red'),cex=2)
tiplabels(' ',PFclades[[1]],bg='yellow')
tiplabels(' ',PFclades[[2]],bg='red')
edgelabels(c('PF1','PF2'),edge=PFedges,bg=c('yellow','red'),cex=2)
### predicted data matrix given phylofactors
pred <- pf.predict(PF)
colnames(pred) <- colnames(Data)
pf.heatmap(tree=tree,Data=pf.predict(PF))
### residual data
resid <- Data/pred
resid <- resid %>% t %>% clo %>% t
pf.heatmap(tree=tree,Data=resid)
par(mar=mar)
##################################
##################################################
############### Other features: ##################
#### glm-style manipulation of formula, weights, etc. #########
#w=1:10
#PF.weighted <- PhyloFactor(Data,tree,X,weights=w,nfactors=1)
# predict meta-data with ILR abundances by changing formula & family
# PF.predict.X <- PhyloFactor(Data,tree,X,frmla=X~Data,nfactors=2,family=binomial)
### more glm controls: offset, model, subset...
#PF.fancy <- PhyloFactor(Data,tree,X,frmla=X~Data,nfactors=2,ncores=2,
#family=binomial,weights=w,offset=rnorm(10),model=FALSE,subset=3:8)
#### Stopping Function ###########################
PF.stop <- PhyloFactor(Data,tree,X,stop.early=TRUE)
PF.stop$terminated
# TRUE - this indicates that the factorization was terminated
# when there was sufficiently low signal
PF.stop$nfactors # 2 - the correct number of factors
all(PF.stop$bins %in% Bins) #
# TRUE - the factors identified were the correct ones.
#### PhyloFactor has built-in parallelization ####
PF.par <- PhyloFactor(Data,tree,X,nfactors=2,ncores=2)
all.equal(PF$factors,PF.par$factors)
##################################################
######### Phylogenetic PCA - maximize variance ###
pf.var <- PhyloFactor(Data,tree,method='max.var',nfactors=2)
######### Multiple regression ####################
b <- rlnorm(ncol(Data))
a <- as.factor(c(rep(0,5),rep(1,5)))
X <- data.frame('a'=a,'b'=b)
frmla <- Data~a+b
PF.M <- PhyloFactor(Data,tree,X,frmla=frmla,nfactors=2)
PF.M$models[[1]]
PF.M.par <- PhyloFactor(Data,tree,X,frmla=frmla,nfactors=2,ncores=2)
all.equal(PF.M$factors,PF.M.par$factors)
####### transform.fcn and contrast.fcn ###########
## If we had Gaussian or approximately Gaussian data,
#GausData <- log(Data)
#pf.gaussian <- PhyloFactor(GausData,tree,X,frmla=frmla,nfactors=2,transform.fcn=I)
## We can also perform amalgamation-style analyses with contrast.fcn
#pf.amalg <- PhyloFactor(GausData,tree,X,frmla=frmla,
# nfactors=2,transform.fcn=I,contrast.fcn=amalgamate)
##################################################
############################# CUSTOMIZED CHOICE FUNCTIONS ################################
#PhyloFactor can also be used for generalized additive models by inputting choice.fcn
#and cluster.depends to load required packages onto the cluster
### Let's work with some newly simulated data ####
set.seed(1.1)
n=100
Data <- matrix(rlnorm(20*n,meanlog = 8,sdlog = .5),nrow=20)
rownames(Data) <- tree$tip.label
a <- rnorm(n)
b <- rnorm(n)
X <- data.frame(a,b)
Data[sigClades[[1]],] <- t(t(Data[sigClades[[1]],])*(20/(1+exp(5*b))))
## This clade has a nonlinear response with b, decreasing for high values of b.
Data[sigClades[[2]],] <- t(t(Data[sigClades[[2]],])*8*a^-2)
## this clade is abundant only for intermediate values of a.
Data <- t(clo(t(Data)))
par(mfrow=c(2,2))
plot(a,gMean(Data[sigClades[[1]],],MARGIN=2),ylab='Group1 gMean')
plot(b,gMean(Data[sigClades[[1]],],MARGIN=2),ylab='Group1 gMean')
plot(a,gMean(Data[sigClades[[2]],],MARGIN=2),ylab='Group2 gMean')
plot(b,gMean(Data[sigClades[[2]],],MARGIN=2),ylab='Group2 gMean')
######### To input a custom choice.fcn, it needs to take as input the vector of
######### ILR coefficients 'y', the input meta-data 'X', and a logical PF.output.
######### The output of the custom choice function when PF.output=T
######### will be returned in PF$custom.output.
## Demo choice.fcn - generalized additive modelling ##
my_gam <- function(y,X,PF.output=FALSE,...){
dataset <- cbind('Data'=y,X)
gg <- mgcv::gam(Data~s(a)+s(b),data=dataset,...)
if (PF.output){
return(gg)
break
} else {
output <- NULL
## The output of the choice function for PF.output=F must contain two labelled numerics:
## an "objective" statistic and a "stopStatistics".
output$objective <- getStats(gg)['ExplainedVar']
output$stopStatistics <- getStats(gg)['Pval']
return(output)
}
}
load.mgcv <- 'library(mgcv)'
######### For parallelization of customized choice function, we may also need to input
######### cluster.depends which loads all dependencies to cluster.
######### The exact call will be clusterEvalQ(cl,eval(parse(text=cluster.depends)))
PF.G.par <- PhyloFactor(Data,tree,X,choice.fcn=my_gam,sp=c(1,1),
cluster.depends = load.mgcv,nfactors=2,ncores=2)
######### Or we can use the built-in method='gam' and input e.g. smoothing penalty sp
PF.G.par2 <- PhyloFactor(Data,tree,X,method='gam',
frmla=Data~s(a)+s(b),sp=c(1,1),nfactors=2,ncores=2)
all(sigClades %in% PF.G.par$bins)
PF.G.par$factors
par(mfrow=c(1,2))
for (ff in 1:2){
gm <- PF.G.par$custom.output[[ff]]
grp <- PF.G.par$groups[[ff]]
if (ff==1){
x=b
nd <- X
nd$a <- rep(mean(a),length(a))
pred <- predict(gm,newdata = nd)
} else {
x=a
nd <- X
nd$b <- rep(mean(b),length(b))
pred <- predict(gm,newdata = nd)
}
y <- BalanceContrast(grp,log(Data))
plot(sort(x),y[order(x)],ylab='ILR Coefficient',
xlab='dominant Independent Variable',
main=paste('Factor',toString(ff),sep=' '))
lines(sort(x),pred[order(x)])
}
######################## Finding Hutchisonian Niches #####################################
### Example of how to use PhyloFactor to identify Gaussian-shapped Hutchinsonian niches ###
set.seed(1)
n=1000
A <- 20
mu=-1
sigma=0.9
Data <- matrix(rlnorm(20*n,meanlog = 8,sdlog = .5),nrow=20)
rownames(Data) <- tree$tip.label
X <- rnorm(n)
Data[sigClades[[1]],] <- t(t(Data[sigClades[[1]],])*A*exp(-(((X-mu)^2)/(2*sigma^2))))
Data <- t(clo(t(Data)))
y1 <- gMean(Data[sigClades[[1]],],MARGIN=2)
y2 <- gMean(Data[setdiff(1:20,sigClades[[1]]),],MARGIN=2)
ratios <- y1/y2
par(mfrow=c(1,1))
plot(X,ratios,
ylab='Group1/Group2 gMean',log='y',
main='Identifying Gaussian-shaped Hutchinsonian Niches',
xlab='Environmental Variable')
frmla=Data~X+I(X^2)
PF.Gaus <- PhyloFactor(Data,tree,frmla=frmla,X,nfactors=1,ncores=2)
all.equal(sigClades[[1]],PF.Gaus$bins[[2]])
y <- PF.Gaus$groups[[1]] %>% BalanceContrast(.,log(Data))
plot(X,y,ylab='Group1/Group2 gMean',
main='Identifying Gaussian-shaped Hutchinsonian Niches',
xlab='Environmental Variable')
lines(sort(X),predict(PF.Gaus$models[[1]])[order(X)],lwd=4,col='green')
legend(-2.5,-3,legend=c('Observed','Predicted'),
pch=c(1,NA),col=c('black','green'),lty=c(NA,1),lwd=c(NA,2))
### Because the regression is performed on an ILR coordinate, getting an estimate
### about the optimal habitat preference and the width of habitat preferences
### requires a little algebra
grp <- PF.Gaus$groups[[1]]
r <- length(grp[[1]])
s <- length(grp[[2]])
coefs <- PF.Gaus$models[[1]]$coefficients
a <- coefs['I(X^2)']
b <- coefs['X']
c <- coefs['(Intercept)']
d <- sqrt(r*s/(r+s))
sigma.hat <- sqrt(-d/(2*a))
mu.hat <- -b/(2*a)
A.hat <- exp(c/d+mu.hat^2/(2*sigma.hat^2))
names(A.hat) <- NULL
names(mu.hat) <- NULL
names(sigma.hat) <- NULL
c('A'=A,'A.hat'=A.hat)
c('mu'=mu,'mu.hat'=mu.hat)
#The optimal environment for this simulated organism is mu=-1
c('sigma'=sigma,'sigma.hat'=sigma.hat) #The standard deviation is ~0.9.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.