test=F
#' a relatively generic function to fit multiple regression with glmmTMB, scaling numeric variables, supporting interactions and conversion of numeric variables to factors
#' @param dataFile data file location
#' @param varInterest the name of the variable to be regressed (a character value)
#' @param listEffects the names of the explanatory variables (a character vector). Polynomial effects should be written as "poly(x,n)"
#' @param interactions a list of numeric vectors giving the position of the explanatory variables composing interactions (TO DO : adapt script to give variable names instead), default to NA (= no interactions)
#' @param formulaRandom the random part of the formula starting by "+" (default to ="+1" = no random effects)
#' @param selSample numeric, to downsample data (for testing)
#' @param tagModel a character tag identifying model outputs
#' @param family distribution family of varInterest (default to "nbinom2", probably the better choice for abundance data)
#' @param asfactor a character vector giving the numeric variables to be treated as factor in the modelling, default to NA
#' @param data by default the function load the file from dataFile, it's also possible to declare the data.frame
#' @param repout le path for the output
#' @param checkRepout TRUE to verifiy the presence of repout and create it (and some sub-repertories) if no exist
#' @param saveFig TRUE to save the figures
#' @param output to get a multiple object from the function, by default FALSE
#' @param doBeep TRUE to do a beep after model fitting
#' @param printFormula TRUE to print the formula
#' @param woInt TRUE to suppress the intercept in the model formula
#' @return write 5 files: (1) a .glm to save model fit in R format; (2) a "XXX_coefs.csv" table giving estimates and vif coefficients of the model; (3) a "XXX.log" to keep track of the formula of the model; (4) a "XXX_Res.csv" a table giving the residuals value; (5) a "forBackTransform_XXX.csv" table giving the mean and standard deviation value of numeric explanatory variables, to allow back transformation to real values when predicting
#' @example see at the end of this code
#' @author Yves Bas, Charlotte Roemer and Romain Lorrilliere
Sp_GLM_short=function(dataFile,varInterest,listEffects,
interactions=NA
,
formulaRandom="+1"
,
selSample=1e10
,
tagModel=""
,
family="nbinom2"
,
asfactor=NA
,
data=NULL
,
repout=NULL
,
checkRepout=TRUE
,
saveFig=FALSE
,
output=FALSE
,
doBeep=TRUE
,
printFormula=TRUE
,
woInt=F
)
{
library(data.table)
library(glmmTMB)
library(plyr)
if(doBeep) library(beepr)
library(corrplot)
FAct=dataFile # Variables à sélectionner et à tester en interaction
VarAbondance=varInterest
VarSimple=listEffects
##Interactions=list(c(6,7,8),c(6,7,9),c(6,8,9),c(6,4))
Interactions=interactions
FormulaRandom=formulaRandom
if(!is.na(asfactor)) {
VarAbondance <- paste0(VarAbondance,ifelse(VarAbondance %in% asfactor,"_as_factor",""))
VarSimple <- paste0(VarSimple,ifelse(VarSimple %in% asfactor,"_as_factor",""))
if(!is.na(Interactions))
for(l in 1:length(Interactions))
Interactions[[l]] <- paste0(Interactions[[l]],ifelse(Interactions[[l]] %in% asfactor,"_as_factor",""))
}
##FormulaRandom="+(1|espece)+(1|site)"
##FormulaRandom="+(1|espece)"
SelSample=selSample #for testing computing time
##variables à rajouter : bioclim1 et 11, type de détecteur
TagModel=tagModel
## Famille
familyMod=family
## Modèle minimal
##FormulaFix_TtSp="nb_contacts~(Jour+I(Jour^2)+I(Jour^3)+I(Jour^4)+I(Jour^5))*DecOT+(AT81+I(AT81^2))+((AT1+I(AT1^2))+(AT9+I(AT9^2)))+SpBioc12+SpHO1S+SpHO2S+SpHO4S+SpWS_S+SpWC_S"
if(woInt)
{
FormulaY=paste0(VarAbondance,"~-1")
}else{
FormulaY=paste0(VarAbondance,"~1")
}
FormulaXList=VarSimple
##pour afficher les milisecondes
op <- options(digits.secs=3)
## preparaton des repertoires de sortie
if(is.null(repout)) repout <- "./VigieChiro/GLMs/"
repoutSummary <- paste0(repout,"Summaries/")
repoutLogs <- paste0(repout,"logs/")
repoutBackTransform <- paste0(repout,"forBackTransform/")
repoutModel <- paste0(repout,"Model/")
if(saveFig)repfig <- paste0(repout,"forBackTransform/")
if(checkRepout) {
dir.create(repout,showWarnings=FALSE)
dir.create(repoutSummary,showWarnings=FALSE)
dir.create(repoutLogs,showWarnings=FALSE)
dir.create(repoutBackTransform,showWarnings=FALSE)
dir.create(repoutModel,showWarnings=FALSE)
if(saveFig) dir.create(repfig,showWarnings=FALSE)
}
FormulaFix_TtSp=FormulaY
for (i in 1:length(FormulaXList))
{
FormulaFix_TtSp=paste(FormulaFix_TtSp,FormulaXList[i],sep="+")
}
if(!is.na(Interactions[1]))
{
for (i in 1:length(Interactions))
{
## Intemp=paste(FormulaXList[Interactions[[i]][1]]
## ,FormulaXList[Interactions[[i]][2]],sep="*")
Intemp=paste(FormulaXList[Interactions[[i]]],collapse="*")
FormulaFix_TtSp=paste(FormulaFix_TtSp,Intemp,sep="+")
}
}
if(is.null(data))
{
SpNuit=fread(FAct)
}else{
SpNuit <- data
}
if(!is.na(asfactor))
{
SpNuit=as.data.frame(SpNuit)
for (i in 1:length(asfactor))
{
test <- match(asfactor[i],names(SpNuit))
newcolname <- paste0(colnames(SpNuit)[test],"_as_factor")
SpNuit[,newcolname] <- as.factor(SpNuit[,test])
}
SpNuit=as.data.table(SpNuit)
}
##compute summaries of activity
ColA=match(VarAbondance,names(SpNuit))
Ab=as.data.frame(SpNuit)[,ColA]
SpNuitwoNA=subset(SpNuit,!is.na(Ab))
AbwoNA=subset(Ab,!is.na(Ab))
SpA1=aggregate(AbwoNA,by=list(SpNuitwoNA$espece),FUN=mean)
if(length(unique(SpNuit$espece))>1) {
barplot(SpA1$x,names.arg=SpA1$Group.1,las=2,cex.names=0.6)
}
SpPos=subset(SpNuitwoNA,AbwoNA>0)
AbPos=subset(Ab,Ab>0)
cat("\n Number of occurence:",(length(AbPos)),"\n")
if(length(AbPos)<=length(VarSimple))
{
cat(paste(FAct,": too few positive data to fit model"),"\n")
}else
{
if(length(unique(SpNuit$espece))>1) {
SpOcc=aggregate(AbPos,by=list(SpPos$espece),FUN=length)
barplot(SpOcc$x,names.arg=SpOcc$Group.1,las=2,cex.names=0.6)
SpAbIfP=aggregate(AbPos,by=list(SpPos$espece),FUN=mean)
barplot(SpAbIfP$x,names.arg=SpAbIfP$Group.1,las=2,cex.names=0.6)
}
## Calcul du VIF (adapté à glmmTMB, sinon il faut adapter v et nam)
## adapted from rms::vif
vif.mer <- function (fit) {
## Variance-Covariance Matrix
v <- vcov(fit)$cond
nam <- names(fixef(fit)$cond)
## exclude intercepts
ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
if (ns > 0) {
v <- v[-(1:ns), -(1:ns), drop = FALSE]
nam <- nam[-(1:ns)]
}
## squarre root of the diagonal matrix (of the variance-covariance matrix)
# !! doesn't work if diag(v) product negative values !!
d <- diag(v)^0.5
if(any(is.na(d))) {
cat("! the diagonal matrice of the variance covariance matrix produce at least one negative value !\n")
cat(" -> the VIFs values are not assessed\n\n")
}
## variance-covariance matrix on outer product of d
d <- v/(d %o% d)
## inversing d
d <- solve(d)
## and return the diag of d
v <- diag(d)
names(v) <- nam
v
}
## Pour correction autocorrelation spatiale
##MaDataActiNew$FauxGroupe=rep(1,nrow(MaDataActiNew))
##MaDataActiNew$Coord=numFactor(MaDataActiNew$X,MaDataActiNew$Y)
SpNuit_SLPAGN=as.data.frame(SpNuitwoNA)
#OtherVariables=subset(names(SpNuit),!(names(SpNuit) %in% VarSimple))
SpNuit_Scale=SpNuit_SLPAGN
Mean=vector()
Sdev=vector()
VarList=vector()
cat("Variable transformation to reduced centered variable\n")
cat(" ",length(VarSimple),"variable(s) to transform")
start <- Sys.time() ## heure de demarage
cat(" ",format(start, "%d-%m-%Y %HH%M")," ... \n ")
for (i in 1:length(VarSimple))
{
cat(i,"")
if(grepl("^",VarSimple[i]))
{
#Var=gsub("poly","",VarSimple[i])
#Terms=tstrsplit(Var,split=",")
#VarTemp=substr(Terms[[1]],2,nchar(Terms[[1]]))
VarTemp=tstrsplit(VarSimple[i],split="\\+")[[1]]
}else{
VarTemp=VarSimple[i]
}
VarList=c(VarList,VarTemp)
Vinit=SpNuit_SLPAGN[,VarTemp]
if(is.numeric(Vinit))
{
Vscale=scale(Vinit)
Mean=c(Mean,mean(Vinit))
Sdev=c(Sdev,sd(Vinit))
}else{
Vscale=Vinit
Mean=c(Mean,NA)
Sdev=c(Sdev,NA)
}
SpNuit_Scale[,VarTemp]=NULL
SpNuit_Scale=cbind(SpNuit_Scale,Vscale)
names(SpNuit_Scale)[ncol(SpNuit_Scale)]=VarTemp
if(i%%10==1 & i != 1){
timeAvancement <- Sys.time() ## heure de d'avancement
timeAvancement <- format(timeAvancement, "%d-%m-%Y %HH%M")
cat("\n ",timeAvancement," ... \n")}
}
cat("DONE\n")
end <- Sys.time() ## heure de fin
diff <- end-start
diff <- paste(round(diff,1),units(diff))
cat(" ", format(end, "%d-%m-%Y %HH%M")," -> ",diff,"\n\n")
forBackTransform=data.frame(cbind(VarList,Mean,Sdev))
forBackTransform=unique(forBackTransform,by="VarList")
csvfile <- paste0(repoutBackTransform,TagModel,".csv")
cat(" --> [CSV]:",csvfile)
fwrite(forBackTransform,csvfile)
cat(" DONE !\n")
ColNumTest=unlist(lapply(SpNuit_Scale[1,],FUN=function(x) is.numeric(x)))
ColNum=subset(names(SpNuit_Scale),ColNumTest)
SpNuit_ColNum=subset(SpNuit_Scale,select=ColNum)
#MatCor=cor(SpNuit_ColNum)
#if(saveFig){
# pngfile <- paste0(repfig,"corrplot","_",tagModel,".png")
# cat(" --> [PNG]:",pngfile)
#png(pngfile)
#}
#corrplot(MatCor)
#if(saveFig){
# dev.off()
#cat(" DONE !\n")
#}
Formula=as.formula(paste0(FormulaFix_TtSp
,FormulaRandom))
if(SelSample<nrow(SpNuit_Scale))
{
SpNuit_Sample=SpNuit_Scale[sample.int(nrow(SpNuit_Scale),SelSample),]
}else{
SpNuit_Sample=SpNuit_Scale
}
cat("\nModel glmmTMB\n")
start <- Sys.time() ## heure de demarage
cat(" ",format(start, "%d-%m-%Y %HH%M")," ... ")
if(printFormula) cat("\nglmmTMB(Formula= ",as.character(Formula)[2]," ",as.character(Formula)[1]," ",as.character(Formula)[3],", family= ",familyMod,")\n",sep="")
ModSp=glmmTMB(Formula,data=SpNuit_Sample, family=familyMod) #37 min
if(doBeep) beep()
end <- Sys.time() ## heure de demarage
diff <- end-start
diff <- paste(round(diff,1),units(diff))
cat(format(end, "%d-%m-%Y %HH%M")," -> ",diff,"\n\n")
Res=residuals(ModSp)
SpNuit_Sample$Res=Res
Estimates=as.data.frame(coef(summary(ModSp))$cond)
term <- row.names(Estimates)
Estimates=cbind(term=term,Estimates)
colnames(forBackTransform) <- c("term","bt_mean","bt_sd")
forBackTransform$bt_sd <- as.numeric(as.character(forBackTransform$bt_sd))
forBackTransform$bt_mean <- as.numeric(as.character(forBackTransform$bt_mean))
Estimates <- merge(Estimates,forBackTransform,by="term",all.x=TRUE)
#PROBLEM: link is not log for binomial but logit, rather extract link function in glm object itself
if(family %in% c("nbinom","nbinom2","poisson","binomial")) Estimates$coef_raw <- exp(Estimates$Estimate) else Estimates$coef_raw <- Estimates$Estimate
Estimates$coef <- ifelse(is.na(Estimates$bt_sd),Estimates$coef_raw,Estimates$coef_raw^(1/Estimates$bt_sd)) #backtransforme les coefficients de variables traitées comme numériques (pas de modif sinon)
mdIC <- as.data.frame(confint(ModSp)[,1:2])
rownames(mdIC) <- gsub("cond.","",rownames(mdIC))
colnames(mdIC) <- c("ICinf","ICsup")
mdIC <- mdIC[as.character(Estimates$term),]
Estimates <- data.frame(Estimates,mdIC)
Estimates$ICinf <- ifelse(is.na(Estimates$bt_sd),exp(Estimates$ICinf),exp(Estimates$ICinf)^(1/Estimates$bt_sd))
Estimates$ICsup <- ifelse(is.na(Estimates$bt_sd),exp(Estimates$ICsup),exp(Estimates$ICsup)^(1/Estimates$bt_sd))
glmfile <- paste0(repoutModel,TagModel,".glm")
cat(" --> [GLM]:",glmfile)
save(ModSp,file=glmfile)
cat(" DONE !\n")
if(woInt)
{
VIFMod=vif.mer(ModSp)
}else{
VIFMod=c(1,vif.mer(ModSp))
}
Estimates$VIF=VIFMod
if(FAct!="")
{
Suffix=tstrsplit(basename(as.character(FAct)),split="[.]")[[1]]
}else{
Suffix=tagModel
}
csvfile <- paste0(repoutSummary,TagModel,"_",Suffix,"_Coefs.csv")
cat(" --> [CSV]:",csvfile)
fwrite(Estimates,csvfile,sep=";")
cat(" DONE !\n")
csvfile <- paste0(repoutLogs,substr(Sys.time(),1,13),".log")
cat(" --> [CSV]:",csvfile)
fwrite(as.list(FormulaFix_TtSp),csvfile)
cat(" DONE !\n")
csvfile <- paste0(repoutModel,TagModel,"_",Suffix,"_Res.csv")
cat(" --> [CSV]:",csvfile)
fwrite(SpNuit_Sample,csvfile)
cat(" DONE !\n")
if(output) return(list(ModSp,Estimates,SpNuit_Sample))
}
}
##for test
if(test)
{
Sp_GLM_short(
dataFile="./VigieChiro/DataSp/RPCirw0_50/Pipkuh.csv"
,
varInterest="nb_contacts_strict"
,
listEffects=c("year","poly(julian,2)","sample_cat","nb_Tron_strict"
,"temps_enr_strict","latitude","longitude","expansion_direct"
)
,
interactions=NA
,
formulaRandom="+(1|site)"
,
selSample=1e10
,
tagModel="GLMalphatest_tendancesFY"
,
family="nbinom2"
,
asfactor="year"
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.