gpf | R Documentation |
Generalized phylofactorization
gpf(Data, tree, frmla.phylo = NULL, frmla = NULL,
PartitioningVariables = NULL, MetaData = NULL, nfactors = NULL,
ncores = NULL, model.fcn = stats::glm, objective.fcn = pvDeviance,
min.group.size = 1, algorithm = "mix", alpha = 0.2,
cluster.depends = "", ...)
Data |
data table containing columns of "Species", and terms in the |
tree |
phylo class object containing all species in |
frmla.phylo |
Formula used for method "phylo" and "mStable". Can use species-specific effects by multilevel factor "Species", and phylo-factors with "phylo". e.g. cbind(Successes,Failures)~x+Species*z+phylo*y has universal coefficient for x, species-specific coefficients for z, and phylo-factor contrasted coefficients for y. |
frmla |
Formula. If |
PartitioningVariables |
Character vector containing the variables in |
MetaData |
data frame or data table of meta-data containing variables found in |
nfactors |
integer for number of factors to find |
ncores |
integers for number of cores to use for parallelization |
model.fcn |
Regression function, such as glm, gam, glm.nb, gls. Must have column labelled "Deviance" in |
objective.fcn |
Optional input objective function. Takes |
min.group.size |
Minimum group size; groups with one element having fewer members than |
algorithm |
Character, either "CoefContrast", "phylo", "mStable" or "mix". "CoefContrast" will partition the standardized coefficient matrix. "phylo" will produce |
alpha |
Numeric between 0 and 1 (strictly greater than 0), indicating the top fraction of edges to use when |
cluster.depends |
Character expression input to |
... |
Additional arguments for |
phylofactor object which can be manipulated with various pf.
functions
library(phylofactor)
require(ggpubr)
ilogit <- function(eta) 1/(1+exp(-eta))
set.seed(1)
m <- 50
n <- 200
tree <- rtree(m)
MetaData <- data.table('y'=rnorm(n),
'z'=rnorm(n,sd=0.5),
'Sample'=sapply(1:n,FUN=function(s) paste('Sample',s)),
key='Sample')
#we'll partition by 'y'.
binom.size=3
clade <- phangorn::Descendants(tree,75,'tips')[[1]]
clade2 <- phangorn::Descendants(tree,53,'tips')[[1]]
######## presence/absence dataset with affected clade #######
## most species have higher P{present} with y
eta <- .5*MetaData$z+MetaData$y
p <- ilogit(eta)
M <- matrix(rbinom(m*n,binom.size,rep(p,times=m)),nrow=m,ncol=n,byrow=TRUE)
rownames(M) <- tree$tip.label
colnames(M) <- MetaData$Sample
#### the first clade decreases with y ####
eta1 <- .5*MetaData$z-MetaData$y
p1 <- ilogit(eta1)
for (species in clade){
M[species,] <- rbinom(n,binom.size,p1)
}
#### the second clade weakly decreases with y ####
eta2 <- .5*MetaData$z-.3*MetaData$y
p2 <- ilogit(eta2)
for (species in clade2){
M[species,] <- rbinom(n,binom.size,p2)
}
#### Default algorithm: 'mix' #####
### For default can input data matrix or data frame
### with "Species", "Sample", and all relevant meta-data
DF <- matrix.to.phyloframe(M,data.name='Successes')
DF[,Failures:=binom.size-Successes]
setkey(DF,Sample)
DF <- DF[MetaData]
### DF must have "Species", "Sample", and the LHS of the formula input.
### A separate data frame or data table, MetaData, can have "Sample" and the RHS of the formula.
### The default algorithm is "mix", which uses CoefContrast to limit the number of edges for
### selection by algorithm 'phylo'. This algorithm has the greatest power but is also
### computationally intensive. It's recommended that you input both frmla (used in CoefContrst)
### and frmla.phylo (used in algorithm 'phylo').
### For species-specific effects in algorithm 'phylo', you can use the variable "Species", e.g.
### frmla.phylo=cbind(Successes,Failures)~Species*z+phylo*y. For universal/shared coefficients
### for "z" across species, simply use frmla.phylo=cbind(Successes,Failures)~z+phylo*y
### Since we're modelling a constant effect of z across species, we want to offset(z)
### in the formula. Let's get the coefficients for that:
model.z <- glm(cbind(Successes,Failures)~z,family=binomial,data=DF)
DF[,z.fit:=coef(model.z)['z']*z]
pf <- gpf(DF,tree,frmla=cbind(Successes,Failures)~offset(z.fit)+y,
frmla.phylo=cbind(Successes,Failures)~offset(z.fit)+phylo*y,
PartitioningVariables='y',
family=binomial,
nfactors=2,
ncores=2)
all.equal(pf$groups[[1]][[1]],clade) & all.equal(pf$groups[[2]][[1]],clade2)
## can also use categorical variables as predictors, but notice the warning for coefficient
## contrasts when PartitioningVariables are the formula's name for that variable. gpf will grep
## for the PartitioningVariables in the names of coefficients - the warning below can be
## ignored, but formulas whose terms grep each other may require specific PartitioningVariables.
DF[,y_factor:=as.factor(y>0)]
pf_categorical <- gpf(DF,tree,frmla=cbind(Successes,Failures)~offset(z.fit)+y_factor,
frmla.phylo=cbind(Successes,Failures)~offset(z.fit)+phylo*y_factor,
PartitioningVariables='y_factor',
family=binomial,
nfactors=2,
ncores=2)
### glm manipulation - can do everything except subset ###
### For non-mStable, weights slow down the algorithm due to data.table indexing issues ###
# w <- sample(2,nrow(DF),T)
# pf.fancy <- gpf(DF,tree,frmla=cbind(Successes,Failures)~y,
# frmla.phylo=cbind(Successes,Failures)~phylo*y,
# PartitioningVariables='y',nfactors=2,ncores=2,
# family=binomial,weights=w,offset=DF$z.fit)
###########################################################
pf.non.offset <- gpf(DF,tree,frmla=cbind(Successes,Failures)~z+y,
frmla.phylo=cbind(Successes,Failures)~z+phylo*y,
PartitioningVariables='y',
family=binomial,
nfactors=2,
ncores=2)
### The difference between the offset & non-offset is that the latter re-estimates the
### coefficient for z in each partition, potentially introducing error & wasting degrees of
### freedom in downstream factors.
### Algorithms "phylo", "mix", and "mStable" have a fairly high percent of the computation which
### is parallelizable. The argument "ncores" performs built-in parallelization.
### Another algorithm is "CoefContrast". For this algorithm, you need to input the frmla and
## Partitioning Variables. CoefContrast is extremely fast and best done without parallelization
## (as it is built off matrix multiplication).
pf <- gpf(DF,tree,frmla=cbind(Successes,Failures)~z+y,
PartitioningVariables='y',
algorithm='CoefContrast',
family=binomial,
nfactors=2)
all.equal(pf$groups[[1]][[1]],clade) & all.equal(pf$groups[[2]][[1]],clade2)
####################### to partition on y, must have phylo* #########
## Also, for inputting matrices into gpf for binomial glm or gam,
## must input a list of matrices with "Successes" and "Failures":
Mat.List <- list('Successes'=M,'Failures'=binom.size-M)
pf <- gpf(Mat.List,tree,MetaData=MetaData,
frmla.phylo=cbind(Successes,Failures)~z+phylo*y,nfactors=2,
family=binomial,algorithm='mStable')
all.equal(pf$groups[[1]][[1]],clade) & all.equal(pf$groups[[2]][[1]],clade2)
observed <- pf.heatmap(tree=tree,Data=M[,order(MetaData$y)])
predicted <- pf.heatmap(tree=tree,Data=ilogit(pf.predict(pf)[,order(MetaData$y)]))
ggarrange(observed,predicted,nrow=2)
################# Poisson Regression
set.seed(1)
eta <- 2*MetaData$z+MetaData$y
lambda <- exp(eta)
M <- matrix(rpois(m*n,rep(lambda,times=m)),nrow=m,ncol=n,byrow=TRUE)
rownames(M) <- tree$tip.label
colnames(M) <- MetaData$Sample
#### the first clade decreases with y ####
eta1 <- .3*MetaData$z-MetaData$y
lambda1 <- exp(eta1)
for (species in clade){
M[species,] <- rpois(n,lambda1)
}
#### the second clade strongly increases with y ####
eta2 <- .3*MetaData$z-MetaData$y
lambda2 <- exp(eta2)
for (species in clade2){
M[species,] <- rpois(n,lambda2)
}
##For non-binomial, use "Data" as response variable #########
pf <- gpf(M,tree,MetaData=MetaData,frmla.phylo=Data~phylo*(z+y),nfactors=2,family=poisson,
PartitioningVariables='y',algorithm='mStable')
pf$factors
all.equal(pf$groups[[1]][[1]],clade) & all.equal(pf$groups[[2]][[1]],clade2)
par(mfrow=c(2,1))
observed <- pf.heatmap(tree=tree,Data=M[,order(MetaData$y)])
predicted <- pf.heatmap(tree=tree,Data=exp(pf.predict(pf)[,order(MetaData$y)]))
ggarrange(observed,predicted,nrow=2)
### partition vector of data controlling for sample effort ###
set.seed(1)
effort <- rnorm(50)
eta <- effort-3
eta[clade] <- eta[clade]+6
eta[clade2] <- eta[clade2]+8
Data <- data.table('Species'=tree$tip.label,effort,
'Successes'=rbinom(50,1,ilogit(eta)),
'Sample'=1)
Data$Failures <- 1-Data$Successes
pf <- gpf(Data,tree,frmla.phylo=cbind(Successes,Failures)~effort+phylo,
nfactors=2,algorithm='phylo',family=binomial)
all.equal(pf$groups[[1]][[1]],clade) & all.equal(pf$groups[[2]][[1]],clade2)
############# generalized additive modelling ################
set.seed(1)
m <- 50
n <- 20
tree <- rtree(m)
MetaData <- data.table('y'=rnorm(n),
'z'=rnorm(n,sd=0.5),
'Sample'=sapply(1:n,FUN=function(s) paste('Sample',s)),
key='Sample')
#we'll partition by 'y'.
binom.size=10
clade <- phangorn::Descendants(tree,75,'tips')[[1]]
clade2 <- phangorn::Descendants(tree,53,'tips')[[1]]
######## presence/absence dataset with affected clade #######
## most species have higher P{present} with y
eta <- .5*MetaData$z+MetaData$y
p <- ilogit(eta)
M <- matrix(rbinom(m*n,binom.size,rep(p,times=m)),nrow=m,ncol=n,byrow=TRUE)
rownames(M) <- tree$tip.label
colnames(M) <- MetaData$Sample
#### the first clade decreases with y ####
eta1 <- .5*MetaData$z+2*MetaData$y^2 #non-linear response
p1 <- ilogit(eta1)
for (species in clade){
M[species,] <- rbinom(n,binom.size,p1)
}
#### the second clade weakly decreases with y ####
eta2 <- .5*MetaData$z-4*MetaData$y #linear response
p2 <- ilogit(eta2)
for (species in clade2){
M[species,] <- rbinom(n,binom.size,p2)
}
DF <- matrix.to.phyloframe(M,data.name='Successes')
DF[,Failures:=binom.size-Successes]
setkey(DF,Sample)
DF <- DF[MetaData]
model.z <- glm(cbind(Successes,Failures)~z,family=binomial,data=DF)
DF[,z.fit:=coef(model.z)['z']*z]
pf <- gpf(DF,tree,frmla.phylo=cbind(Successes,Failures)~offset(z.fit)+s(y,by=phylo),
PartitioningVariables='y',family=binomial,
nfactors=2,ncores=2,model.fcn = mgcv::gam,algorithm = 'phylo')
pf$factors
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.