################################################################################
# manyglm: Fit a multivariate generalized linear model for high dimensional data
# the (default) methods coef, residuals, fitted values can be used
###############################################################################
manyglm <- function (formula, family="negative.binomial", K=1, data=NULL, subset=NULL, na.action=options("na.action"), theta.method = "PHI", model = FALSE, x = TRUE, y = TRUE, qr = TRUE, cor.type= "I", shrink.param=NULL, tol=sqrt(.Machine$double.eps), maxiter=25, maxiter2=10, show.coef=FALSE, show.fitted=FALSE, show.residuals=FALSE, show.warning=FALSE, offset, ... ) {
#if (data!=NULL) attach(data)
# tasmX <- as.matrix(tasmX, "numeric")
# get family and link
#fam <- call$family
#family.char <- as.character(fam)
if ( is.character(family) ) {
if (substr(family,1,1) == "g") {
familynum <- 0 # gaussian
familyname <- "gaussian"
}
else if (substr(family,1,1) == "p") {
familynum <- 1 #poisson
familyname <- "poisson"
}
else if (substr(family,1,1) == "n") {
familynum <- 2 #neg bin
familyname <- "negative.binomial"
}
else if (substr(family,1,1) == "b") {
familynum <- 3 #logistic/binomial
familyname <- "binomial"
}
else stop (paste("'family'", family, "not recognized"))
}
else stop ("Please specify a family function with a character string. manyglm supports the following members of the exponential family: 'gaussian', 'poisson', 'binomial', 'negative.binomial' distributions.")
ret.x <- x
ret.y <- y
ret.qr <- qr
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "na.action", "offset"), names(mf), 0)
mf <- mf[c(1, m)]
mf$drop.unused.levels <- TRUE
mf[[1]] <- as.name("model.frame")
data <- mf <- eval(mf, parent.frame()) # Obtain the model.frame.
mt <- attr(mf, "terms") # Obtain the model terms.
offset <- as.vector(model.offset(mf))
abundances <- as.matrix(model.response(mf, "numeric"))
if (any(is.na(abundances)) & is.null(na.action))
stop("There are NA values in the response. An 'na.action' is necessary.")
if(any(is.na(abundances)) & any(as.character(na.action)=="na.pass"))
stop("There are NA values in the response. 'na.action=na.pass' cannot be used.")
N <- NROW(abundances) # eg number of sites
p <- NCOL(abundances) # number of organism types
labAbund<-colnames(abundances)
if (p==0) stop("A model without response cannot be fitted.")
else if (p==1 & is.null(labAbund))
labAbund <- deparse(attr( mt,"variables")[[2]], width.cutoff = 500)
else if (p>1 & is.null(labAbund))
labAbund <- paste(deparse(attr( mt,"variables")[[2]],width.cutoff = 500), 1:p,sep = "")
labObs <- rownames(abundances)
Y <- abundances
if (any(!is.wholenumber(Y)))
warning(paste("Non-integer data are fitted to the", familyname, "model."))
if ( familyname == "binomial" ) { # binomial distribution
if(!is.null(labAbund) & all(substr(labAbund, 1,4)%in%c("succ", "fail")))
{
p <- p/2
labAbund <- labAbund[substr(labAbund, 1,4) == "succ"]
if(length(labAbund)!=p)
stop("for each variable of the response a column of successes and ", "a column of failures \nmust be provided ",
"(marked by 'succ' and 'fail', see 'binstruc')")
}
if (all(is.wholenumber(Y)) & (length(Y[Y>1]>0)))
warning("Count data are fitted to the binomial regression. Conisder a transformation first.")
if ( (length(Y[Y<0])>0) | (length(Y[Y>1]>0)) )
warning("Data exceeds the range [0, 1].")
}
##################### BEGIN Estimation ###################
# Obtain the Designmatrix.
X <- model.matrix(mt, mf)
# Obtain regression parameters
if (familyname=="gaussian") {
stop("Please use manylm to fit Guassian")
# z <- manylm(formula, data=data, subset=subset, na.action=na.action, model=model, x=x, y =y, qr=qr, cor.type=cor.type, shrink.param=shrink.param, tol=tol, ...)
# z$family <- "gaussian"
# z$formula <- formula
# z$data <- data
# class(z) <- c("manylm", "mlm")
# return(z)
}
else {
assign <- attr(X, "assign")
tX <- t(X)
dup <- duplicated(tX)
assign <- assign[!dup]
tX <- t(tX[!dup, ]) # remove duplicated col
if ( nrow(tX) == nrow(X) ) X <- tX
else X <- t(tX) # for a single-column matrix
if (any(is.na(X)) & is.null(na.action))
stop("There are NA values in the independent variables. An 'na.action' is necessary.")
if(any(is.na(X)) & any(as.character(na.action)=="na.pass"))
stop("There are NA values in the independent variables. 'na.action=na.pass' cannot be used.")
if (N!=nrow(X))
stop("dimensions of response and independent variables in 'formula' do not match")
q <- ncol(X)
labX<-colnames(X)
# get rank
qrx <- qr(X)
rank <- qrx$rank
# the following values need to be converted to integer types
if (theta.method == "ML") methodnum <- 0
else if (theta.method == "Chi2") methodnum <- 1
else if (theta.method == "PHI") methodnum <- 2
else stop("Check the param estimation method name.")
if (show.warning==TRUE) warn <- 1
else warn <- 0
######### call Glm Fit Rcpp #########
modelParam <- list(tol=tol, regression=familynum, estimation=methodnum, stablizer=FALSE, n=K, maxiter=maxiter, maxiter2=maxiter2, warning=warn)
if(is.null(offset)) O <- matrix(0, nrow=N, ncol=p)
else if (NCOL(offset)==1) O <- matrix(rep(offset), nrow=N, ncol=p)
else O <- as.matrix(offset)
z <- .Call("RtoGlm", modelParam, Y, X, O, PACKAGE="mvabund")
if(any(z$var.est==0)) #DW change, 30/10/14
{
z$var.estimator = pmax(z$var.est,1.e-6) #1.e-6 is used because it is elsewhere in this matrix. But shouldn't it be tol?
z$residuals = (Y - z$fit) / sqrt(z$var.est)
}
# New codes added for estimating ridge parameter
if (cor.type=="shrink") {
if (is.null(shrink.param)) {
tX <- matrix(1, NROW(X), 1)
shrink.param <- ridgeParamEst(dat=z$residuals, X=tX, only.ridge=TRUE, tol=tol)$ridgeParameter
}
# to simplify later computation
if(shrink.param == 0) cor.type <- "I"
else if(shrink.param == 1) cor.type <- "R"
else if (abs(shrink.param)>1)
stop("the absolute 'shrink.param' should be between 0 and 1")
}
else if (cor.type == "I") shrink.param <- 0
else if (cor.type == "R") {
if (nrow(abundances)<=ncol(abundances))
stop("An unstructured correlation matrix should only be used if N>>number of variables.")
if(nrow(abundances) < 2*ncol(abundances))
warning("the calculated p-values might be unreliable as the number of cases is not much larger than the number of variables")
shrink.param <- 1
}
# parameters that are needed for tests in anova.manylm and summary.manylm.
dimnames(z$fitted.values) <- list(labObs, labAbund)
dimnames(z$coefficients) <- list(colnames(X), labAbund)
dimnames(z$var.coefficients) <- list(colnames(X), labAbund)
dimnames(z$linear.predictor) <- list(labObs, labAbund)
dimnames(z$residuals) <- list(labObs, labAbund)
dimnames(z$PIT.residuals) <- list(labObs, labAbund)
dimnames(z$sqrt.1_Hii) <- list(labObs, labAbund)
dimnames(z$var.estimator) <- list(labObs, labAbund)
dimnames(z$sqrt.weight) <- list(labObs, labAbund)
names(z$theta) <- labAbund
names(z$two.loglike) <- labAbund
names(z$deviance) <- labAbund
names(z$aic) <- labAbund
names(z$iter) <- labAbund
z$data <- mf
z$stderr.coefficients <- sqrt(z$var.coefficients)
dimnames(z$stderr.coefficients) <- list(colnames(X), labAbund)
z$phi <- 1/z$theta
z$tol <- tol
z$maxiter <- maxiter
z$maxiter2 <- maxiter2
z$prior.weight <- NULL
z$AICsum <- sum(z$aic)
z$family <- familyname
z$K <- K
z$theta.method<- theta.method
z$cor.type <- cor.type
z$shrink.param <- shrink.param
z$call <- cl
z$terms <- mt
z$assign <- assign
z$formula <- formula
z$rank <- rank
z$xlevels <- .getXlevels(mt, mf)
z$df.residual <- N-rank
if (model) z$model <- mf
if (ret.x) z$x <- X
if (ret.y) z$y <- abundances
if (ret.qr) z$qr <- qrx
z$show.coef <- show.coef
z$show.fitted <- show.fitted
z$show.residuals <- show.residuals
z$offset <- O
class(z) <- c("manyglm", "mglm")
return(z)
}
}
is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)
{
return(abs(x - round(x)) < tol)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.