# #############################################
# Author: Manuel Oviedo de la Fuente
# November, 2017 (v1) - July, 2019 (v2)
# R code for manuscrit: "Determining optimum wavelengths for leaf water
# content estimation from reflectance: a distance correlation approach"
##################################
# Find local maxima of x
# Returns the set of locations of the local maxima
localMaxima <- function(x) {
y <- diff(c(-.Machine$integer.max, x)) > 0L
rle(y)$lengths
y <- cumsum(rle(y)$lengths)
y <- y[seq.int(1L, length(y), 2L)]
if (x[[1]] == x[[2]]) {
y <- y[-1]
}
y
}
##################################
##################################
# Arguments
# y: name of the response variable
# covar: vector with the names of the predictor variables
# data: data frame
# We compute the Distance correlation (and t-test) of multivariate and functional independence
# (wrapper functions of energy package).
# tol: Tolerance values for the selection criteria. The variables whose distance correlation
# values are greater than the tolerance are selected.
# pvalue: p-value of the t-test.
# plot: logical value, if TRUE plots the distance correlation curve for each covariate in multivariate case
# and in each discretization points (argvals) in the functional case.
# smo: logical value, If true the distance correlation function is smoothed.
# using b-spline representation with a suitable number of basis elements.
# local.pc: not used in the manuscript
# ceros.pc: not used in the manuscript
# verbose: print iterative and relevant steps of the procedure.
# Return a list of two elements:
# a) cor: the value of distance correlation for eahc covariate
# b) maxLocal: index or locations of local maxima distance correlations
#' Impact points selection of functional predictor and regression using local
#' maxima distance correlation (LMDC)
#'
#' LMDC.select function selects impact points of functional predictior using
#' local maxima distance correlation (LMDC) for a scalar response given.\cr
#' LMDC.regre function fits a multivariate regression method using the selected
#' impact points like covariates for a scalar response.
#'
#' String of characters corresponding to the name of the regression method
#' called.
#' Model available options:
#' \itemize{
#' \item \code{"lm"}: Step-wise lm regression model (uses lm function, stats
#' package). Recommended for linear models, test linearity using.
#' \code{\link{flm.test}} function.
#' \item \code{"gam"}: Step-wise gam regression model (uses gam function, mgcv package).
#' Recommended for non-linear models.
#' }
#' Models that use the indicated function of the required package:
#' \itemize{
##' \item \code{"svm"}: Support vector machine (svm function, e1071 package).
#' \item \code{"knn"}: k-nearest neighbor regression (knnn.reg function, FNN package).
#' \item \code{"lars"}: Least Angle Regression using Lasso (lars function, lars
#' package).
#' \item \code{"glmnet"}: Lasso and Elastic-Net Regularized Generalized Linear Models
#' (glmnet and cv.glmnet function, glmnet package).
#' \item \code{"rpart"}: Recursive partitioning for regression a (rpart function, rpart package).
#' \item \code{"flam"}: Fit the Fused Lasso Additive Model for a Sequence of Tuning
#' Parameters (flam function, flam package).
#' \item \code{"novas"}: NOnparametric VAriable Selection (code available in
#' \url{https://www.math.univ-toulouse.fr/~ferraty/SOFTWARES/NOVAS/novas-routines.R}).
#' \item \code{"cosso"}: Fit Regularized Nonparametric Regression Models Using COSSO
#' Penalty (cosso function, cosso package).
#' \item \code{"npreg"}: kernel regression estimate of a one (1) dimensional dependent
#' variable on p-variate explanatory data (npreg function, np package).
#' \item \code{"mars"}: Multivariate adaptive regression splines (mars function, mda
#' package).
#' \item \code{"nnet"}: Fit Neural Networks (nnet function, nnet package).
#' \item \code{"lars"}: Fits Least Angle Regression, Lasso and Infinitesimal Forward
#' Stagewise regression models (lars function, lars package).
#' }
#'
#' @aliases LMDC.select LMDC.regre
#' @param y name of the response variable.
#' @param covar vector with the names of the covaviables (or points of impact)
#' with length \code{p}.
#' @param data data frame with length n rows and at least p + 1 columns,
#' containing the scalar response and the potencial p covaviables (or points of
#' impact) in the model.
#' @param tol Tolerance value for distance correlation and imapct point.
#' @param pvalue pvalue of bias corrected distance correlation t-test.
#' @param plot logical value, if TRUE plots the distance correlation curve for
#' each covariate in multivariate case and in each discretization points
#' (argvals) in the functional case.
#' @param local.dc Compute local distance correlation.
#' @param smo logical. If TRUE, the curve of distance correlation computed in
#' the impact points is smoothed using B-spline representation with a suitable
#' number of basis elements.
#' @param verbose print iterative and relevant steps of the procedure.
#' @param newdata An optional data frame in which to look for variables with
#' which to predict.
#' @param method Name of regression method used, see details. This argument is
#' used in do.call function like "what" argument.
#' @param par.method List of parameters used to call the method. This argument
#' is used in do.call function like "args" argument.
#' @return
#' \item{\code{LMDC.select}}{ function returns a list of two elements:}
#' \item{\code{cor}}{ the value of distance correlation for each covariate.}
#' \item{\code{maxLocal}}{ index or locations of local maxima distance correlations.}
#' \item{\code{LMDC.regre}}{ function returns a list of folowing elements:}
#' \item{\code{model}}{ object corresponding to the estimated method using the selected variables.}
#' \item{\code{xvar}}{ names of selected variables (impact points).}
#' \item{\code{edf}}{ effective degrees of freedom.}
#' \item{\code{nvar}}{ number of selected variables (impact points).}
#'
#' @author Manuel Oviedo de la Fuente \email{manuel.oviedo@@udc.es}
#' @seealso See Also as: \link[stats]{lm}, \link[mgcv]{gam},
#' \code{\link{dcor.xy}}.
#' @references Ordonez, C., Oviedo de la Fuente, M., Roca-Pardinas, J.,
#' Rodriguez-Perez, J. R. (2018). Determining optimum wavelengths for leaf
#' water content estimation from reflectance: A distance correlation approach.
#' \emph{Chemometrics and Intelligent Laboratory Systems}. 173,41-50
#' \doi{10.1016/j.chemolab.2017.12.001}.
#' @keywords regression
# @importFrom mgcv gam gam.fit bam s te t2
#' @examples
#' \dontrun{
#' data(tecator)
#' absorp=fdata.deriv(tecator$absorp.fdata,2)
#' ind=1:129
#' x=absorp[ind,]
#' y=tecator$y$Fat[ind]
#' newx=absorp[-ind,]
#' newy=tecator$y$Fat[-ind]
#'
#' ## Functional PC regression
#' res.pc=fregre.pc(x,y,1:6)
#' pred.pc=predict(res.pc,newx)
#'
#' # Functional regression with basis representation
#' res.basis=fregre.basis.cv(x,y)
#' pred.basis=predict(res.basis[[1]],newx)
#'
#' # Functional nonparametric regression
#' res.np=fregre.np.cv(x,y)
#' pred.np=predict(res.np,newx)
#'
#' dat <- data.frame("y"=y,x$data)
#' newdat <- data.frame("y"=newy,newx$data)
#'
#' res.gam=fregre.gsam(y~s(x),data=list("df"=dat,"x"=x))
#' pred.gam=predict(res.gam,list("x"=newx))
#'
#' dc.raw <- LMDC.select("y",data=dat, tol = 0.05, pvalue= 0.05,
#' plot=F, smo=T,verbose=F)
#' covar <- paste("X",dc.raw$maxLocal,sep="")
#' # Preselected design/impact points
#' covar
#' ftest<-flm.test(dat[,-1],dat[,"y"], B=500, verbose=F,
#' plot.it=F,type.basis="pc",est.method="pc",p=4,G=50)
#'
#' if (ftest$p.value>0.05) {
#' # Linear relationship, step-wise lm is recommended
#' out <- LMDC.regre("y",covar,dat,newdat,pvalue=.05,
#' method ="lm",plot=F,verbose=F)
#' } else {
#' # Non-Linear relationship, step-wise gam is recommended
#' out <- LMDC.regre("y",covar,dat,newdat,pvalue=.05,
#' method ="gam",plot=F,verbose=F) }
#'
#' # Final design/impact points
#' out$xvar
#'
#' # Predictions
#' mean((newy-pred.pc)^2)
#' mean((newy-pred.basis)^2)
#' mean((newy-pred.np)^2)
#' mean((newy-pred.gam)^2)
#' mean((newy-out$pred)^2)
#' }
#' @name LMDC.select
#' @rdname LMDC.select
#' @export
LMDC.select <- function(y, covar, data, tol = .06, pvalue = .05,
plot = FALSE, local.dc = TRUE,
smo = FALSE, verbose = FALSE){
yy <- data[,y]
if (missing(covar)) covar=names(data)#setdiff(names(data),y)
covar <- setdiff(covar,y)
xx<-data[,covar,drop=F]
nn<-ncol(xx)
dc<-numeric(nn)
for (i in 1:nn){
a<-dcor.xy(xx[,i,drop=F],yy)
dc[i]<-a$estimate
if (verbose) cat(i,a$estimate,a$p.value,a$p.value<pvalue & a$estimate>tol,"\n")
}
if (smo) {
nbase<-ifelse(nn<50,floor(nn/2),floor(nn^(4/5)))
#print(nbase)
dc1<-fdata2fd(fdata(dc),nbasis=nbase)
dc2<-fdata(dc1,1:nn)$data[1,]
}
else dc2<-dc
regre<-TRUE #REGRESION O CLASIFICACION
if (is.factor(yy)) regre<-FALSE
#if(local.pc) pc<-fregre.pc.cv(fdata(xx),yy)[[1]]
maxLocal<-max.pc1<-max.pc2<-max.pc3<-NULL
if (local.dc) maxLocal<-localMaxima(dc2)
#if (local.pc) max.pc1<-localMaxima(pc$beta.est$data[1,])
#if (ceros.pc) max.pc3<- c(1,which(diff(pc$beta.est$data[1,]>0)!=0))
maxLocal<-unique(c(maxLocal,max.pc1,max.pc3))
maxLocal2<-intersect(which(dc2>tol),maxLocal)
xorder<-order(dc2[maxLocal2],decreasing =TRUE)
dc2<-dc2[maxLocal2][xorder]
maxLocal2<-maxLocal2[xorder]
if (plot){
par(mfrow=c(1,1))
plot(dc2)
lines(dc,col=4)
abline(v=maxLocal,col=2)
abline(v=maxLocal2,col=3,lwd=2)
}
nvar<-length(maxLocal2)
names(dc)<-covar
return(list(dcor=dc,maxLocal=maxLocal2))
}
#######################################################
#######################################################
# Arguments
# y: name of the response variable
# covar: vector with the names of the predictor variables
# data: data frame for model estimation (data train)
# newdata: (optional) new data frame for model prediction (data test)
# We compute the Distance correlation (and t-test) of multivariate and functional independence
# (wrapper functions of energy package).
# tol: Tolerance values for the selection criteria. The variables whose distance correlation
# values are greater than the tolerance are selected.
# pvalue: p-value of covariate significance (used in "lm" and "gam" methods).
# method: character, name of the regression method called. Model options
# "lm": step-wise lm regression model (lm function, stats package)
# "gam": step-wise gam regression model (gam function, mgcv package).
# "svm": Support vector machine (svm function, e1071 package).
# "knn": k-nearest neighbor regression (knnn.reg function,F NN package).
# "lars": Least Angle Regression using Lasso (lars function, lars package).
# "glmnet": Lasso and Elastic-Net Regularized Generalized Linear Models (glmnet and cv.glmnet function, glmnet package).
# "rpart": Recursive partitioning for regression a (rpart function, rpart package).
# par.method: list of arguments related with the regression method called.
# plot: logical value, not implemented
# verbose: print iterative and relevant steps of the procedure.
# Return a list of two elements:
# a) model: final estimated model
# b) xvar: not implemented
# c) pred: (optional) vector with the predicted values
# d) nvar: number of covariates
#######################################################
#######################################################
#' @rdname LMDC.select
#' @export
LMDC.regre <- function(y,covar,data,newdata,pvalue=.05,
method="lm", par.method = NULL,
plot=FALSE,verbose=FALSE){
edf<-Inf
nvar <- length(covar)
if (missing(newdata)) pred <- FALSE
else pred <- TRUE
if (is.null(covar)){
return(list(model=lm(data[,y]~ 1,data=data), xvar=xvar, pred=rep(mean(data[,y]),len=nrow(newdata))) )
}
pred0 <- NULL
nvar <- length(covar)
#xnames<-colnames(xx)[maxLocal2]
xnames <- covar
xvar<-NULL
ff<-paste(y,"~1")
if (method == "lm"){
#model0<-lm(formula(ff),data=data)
par.method$formula <- formula(ff)
par.method$data <- data
model0<-do.call(method,par.method)
for (i in 1:nvar) {
if (verbose) print(i)
xentra<-xnames[i]
xvar2<- c(xvar,xentra)
ff<-paste(y,"~",paste(xvar2,collapse="+"),collapse="")
if (verbose) {print("lm"); print(1);print(ff)}
par.method$formula <- formula(ff)
model <- do.call(method,par.method)
if (rev(summary(model)$coefficients[,"Pr(>|t|)"])[1]<pvalue){
if (verbose) { print("entra");print(xentra);print(summary(model))}
model0 <- model
xvar <- xvar2
}
} # fin for
edf <- summary(model0)$df[1]
nvar<-edf-1
} #fin lm
if (method == "gam"){
if (!is.null(par.method$k)) {
ik<-which(names(par.method)=="k")
print(par.method)
k<-par.method$k
par.method<-par.method[-ik]
}
else k <- 4
ff<-as.formula(ff)
model0 <- mgcv::gam(ff,data=data)
par.method2 <- list("formula"=ff,"data"= data)
par.method<-c(par.method2,par.method)
for (i in 1:nvar) {
if (verbose) print(i)
xentra<-xnames[i]
xvar2<- c(xvar,xentra)
ff<-as.formula(paste(y,"~",paste("s(",xvar2,",k=",k,")",collapse="+"),collapse=""))
par.method$formula<-ff
if (verbose) {
print("gam");
#print(1);print(ff)
}
model <- do.call(method,par.method)
if (rev(summary(model)$s.table[,"p-value"])[1]<pvalue){
if (verbose) { print("entra");print(xentra);print(summary(model))}
model0 <- model
xvar <- xvar2
}
} # fin for
edf<- sum(model0$edf)
nvar <- ncol(model0$model)-1
} #fin Gam
######################################################
if (method == "svm"){
if (is.null(par.method)) par.method=list("cost"=100,"gamma"=1,"kernel"="linear")
par.method$x <- data[,covar,drop=F]
par.method$y <- data[,"y"]
model0 <- do.call(method,par.method)
}
if (method == "rpart"){
ff<-as.formula(paste(y,"~",paste(covar,collapse="+"),collapse=""))
par.method$formula <- ff
par.method$data <- data
model0 <- do.call(method,par.method)
}
if (method == "knn"){
par.method$train <- data[,covar,drop=F]
par.method$test <- newdata[,covar,drop=F]
par.method$y <- data[,"y"]
model0 <- do.call("knn.reg",par.method)
pred0<-model0$pred
}
if ( method =="lars") {
if (is.null(par.method))
par.method= list(type="lasso",normalize=FALSE,intercept = TRUE,use.Gram=FALSE)
x0<-as.matrix(data[,covar])
par.method$x <- x0
par.method$y <- data[,"y"]
model0 <- do.call(method, par.method)
templam <- range(model0$lambda)
lambda<-seq(templam[1], templam[2], length=200)
cv <- do.call("cvlars",
list(x=x0, y=data[,"y"],K=10,lambda=lambda, trace = FALSE,
intercept = TRUE,normalize=FALSE, type="lasso",use.Gram=F))
#cv <- cvlars(x=x0, y=data[,"y"],K=10,lambda=lambda, trace = FALSE,
# intercept = TRUE,normalize=FALSE, type="lasso",use.Gram=F)
minl<-lambda[which.min(cv)]
pred0 <- do.call("predict.lars",
list("object" = model0, "newx" = as.matrix(newdata[,covar]),
"s" = minl,"type"= "fit", "mode"= "lambda"))$fit
#pred0 <- predict.lars( model0 ,newx=as.matrix(newdata[,covar]), s=minl,type="fit",mode="lambda")$fit
cv <- do.call("lars::cv.lars",list("x" = x0, "y" = data[,"y"],"K" = 10))
#cv <- lars::cv.lars(x0, data[,"y"],K=10)
# pred0 <- predict.lars( cv ,newx=as.matrix(newdata[,covar]), s=minl,type="fit",mode="lambda")$fit
ideal_l1_ratio <- cv$index[which.max(cv$cv - cv$cv.error <= min(cv$cv))]
#obj <- lars::lars(x0, data[,"y"])
obj <- do.call("lars::lars",list("x"=x0, "y"=data[,"y"]))
scaled_coefs <- scale(obj$beta, FALSE, 1 / obj$normx)
l1 <- apply(X = scaled_coefs, MARGIN = 1, FUN = function(x) sum(abs(x)))
coef(obj)[which.max(l1 / do.call("tail",list("x"=l1, "n"=1)) > ideal_l1_ratio),]
pred0 <- do.call("predict.lars",list("object"= model0,
"newx"=as.matrix(newdata[,covar]),
"s"=minl,"type"="fit","mode"="lambda"))$fit
#pred0 <- predict.lars( model0 ,newx=as.matrix(newdata[,covar]), s=minl,type="fit",mode="lambda")$fit
nvar <- sum( coef(obj)[which.max(l1 / do.call("tail",list("x"=l1, "n"=1)) > ideal_l1_ratio),]>0)
}
if ( method =="glmnet") {
#x should be a matrix with 2 or more columns
x0 <- as.matrix(data[,covar,drop=F])
newx0 <- as.matrix(newdata[,covar,drop=F])
if (ncol(x0)==1){
x0<-cbind(x0,1:nrow(x0))
newx0<-cbind(newx0,1:nrow(newx0))
}
if (is.null(par.method))
par.method= list(family="gaussian", standardize=TRUE, nfolds=10)
par.method$x <- x0
par.method$y <- data[,"y"]
model0 <- do.call("cv.glmnet",par.method)
pred0<-predict(model0,newx=newx0, s=model0$lambda.min)
#min.scale<-which.min(model0$cvm)
#min.lambda<-model0$lambda.min
#temp<-coef( model0$glmnet.fit, s=min.lambda)
#temp<-temp[-1]
#ntheta1<-rep(0,length(coef(model0)))
#ntheta1[which(abs(temp) > 0)]<-temp[which(abs(temp) > 0)]
#edf <- length(which(abs(ntheta1)>0))
c<-coef(model0 ,s='lambda.min',exact=TRUE)
inds<-which(c!=0)
variables<-row.names(c)[inds]
variables<-setdiff(variables,'(Intercept)')
nvar<- length(variables)
edf <-nvar +1
}
if ( method =="nnet") {
if (is.null(par.method))
par.method<-list( size = 5, rang = .1,decay = 5e-6, maxit =1000,linout=T)
par.method$x <- data[,covar,drop=F]
par.method$y <- data[,"y"]
model0 <- do.call("nnet",par.method)
}
if ( method =="mars") {
#if (is.null(par.method))
# par.method<-list( size = 5, rang = .1,decay = 5e-6, maxit =1000,linout=T)
par.method$x <- data[,covar,drop=F]
par.method$y <- data[,"y"]
model0 <- do.call("mars",par.method)
}
if (method == "npreg"){
#ff<-as.formula(paste(y,"~",paste(covar,collapse="+"),collapse=""))
#par.method$formula <- ff
#par.method$data <- data
par.method$txdat<-data[,covar,drop=F]
par.method$tydat<-data[,"y"]
model0 <- do.call(method,par.method)
}
if (method == "flam"){
#if (is.null(par.method))
# par.method=list("alpha" = 0.75, n.fold = 2)
par.method$x <- data[,covar,drop=F]
par.method$y <- data[,"y"]
model0 <- do.call("flamCV",par.method)
alpha <- model0$alpha
lambda <- model0$lambda.cv
pred0<- predict(model0$flam.out, new.x =newdata[,covar,drop=F],
lambda = lambda, alpha = alpha)
}
if (method == "novas"){
#if (is.null(par.method))
# par.method=list("alpha" = 0.75, n.fold = 2)
par.method$COVARIATES<- data[,covar,drop=F]
par.method$Responses <- data[,"y"]
model0 <- do.call("novas",par.method)
pred0<- predict(model0, newdata[,covar,drop=F])
nvar <- edf <- length(model0$model)
}
if ( method =="cosso") {
#x should be a matrix with 2 or more columns
x0 <- as.matrix(data[,covar,drop=F])
newx0 <- as.matrix(newdata[,covar,drop=F])
if (ncol(x0)==1){
x0<-cbind(x0,1:nrow(x0))
newx0<-cbind(newx0,1:nrow(newx0))
}
model0<-do.call("cosso",list("x"=x0,"y"=data[,y],"family"="Gaussian"))
xvar<-do.call("predict.cosso",
list("object"=model0,"M"=2,"type"="nonzero"))
pred0<-do.call("predict.cosso",list("object"=model0,"xnew"=newx0,
"M"=2,"type"="fit"))
nvar <- length(xvar)
edf <-nvar+1
}
if (method != "knn" & method != "cosso" & method != "lars" & method != "novas" & method != "glmnet" & method != "flam"){
#print(method); print(pred)
if (pred) pred0 <- predict(model0, newdata=newdata[,covar,drop=F])
else pred0 <- NULL
}
return(list(model=model0, xvar=xvar, pred=pred0,edf=edf,nvar=nvar))
}
################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.