# Demonstration of penint3D functions for evaluation and prediction of 3D interaction models
# Target variable : Arsenic
# Data: Bor
library(rgdal)
library(GSIF)
library(gdalUtils)
library(raster)
library(plyr)
library(aqp)
library(psych)
library(mda)
library(classInt)
library(caret)
library(MASS)
library(splines)
library(glmnet)
library(hierNet)
library(magrittr)
library(doParallel)
library(foreach)
library(stargazer)
library(gstat)
# Working directory with the input .rda files; uncomment the second
# assignment (and comment the first) when running on Windows.
path <- "~/Dropbox/Extensions of soil 3D trend models/Data and Scripts"
#path <- "C:/Users/Milutin/Dropbox/Extensions of soil 3D trend models/Data and Scripts"
# Load the Bor profile observations (object 'bor') and the covariate grids
# (presumably 'gridmaps.sm2D', per the commented example below -- TODO confirm).
load(paste(path,"BorData.rda",sep = "/"))
load(paste(path,"covmaps.rda",sep = "/"))
#================== Spatial references ===============================================================
# Coordinate reference systems (proj4 strings):
#   gk_7 -- transverse Mercator on the Bessel ellipsoid with a 7-parameter
#           towgs84 shift (name suggests Gauss-Krueger zone 7 -- confirm)
#   utm  -- UTM zone 34 on GRS80 with a towgs84 shift
gk_7 <- "+proj=tmerc +lat_0=0 +lon_0=21 +k=0.9999 +x_0=7500000 +y_0=0 +ellps=bessel +towgs84=574.027,170.175,401.545,4.88786,-0.66524,-13.24673,0.99999311067 +units=m"
utm <- "+proj=utm +zone=34 +ellps=GRS80 +towgs84=0.26901,0.18246,0.06872,-0.01017,0.00893,-0.01172,0.99999996031 +units=m"
# Full names and matching abbreviations of the environmental covariates;
# element i of CovAbb abbreviates element i of CovNames.
CovNames <- c(
  "Digital Elevation Model",
  "Aspect",
  "Slope",
  "Topographic Wetness Index",
  "Convergence Index",
  "Cross Sectional Curvature",
  "Longitudinal Curvature",
  "Channel Network Base Level",
  "Vertical Distance to Channel Network",
  "Negative Openness",
  "Positive Openness",
  "Wind Effect (East)",
  "Wind Effect (North-West)",
  "Down-wind Dilution",
  "Cross-wind Dilution",
  "Corine Land Cover 2006",
  "Soil Type"
)
CovAbb <- c(
  "DEM", "Aspect", "Slope", "TWI", "ConvInd", "CrSectCurv", "LongCurv",
  "ChNetBLevel", "VDistChNet", "NegOp", "PosOp", "WEeast", "WEnw",
  "DD", "CD", "clc", "SoilType"
)
#=================== DATA ============================================================
# Subset the Bor data set to the profile/location identifiers and the
# observed soil properties of interest.
bor.profs <- bor[,c("ID","x","y","Soil.Type","Top","Bottom","As","pH","Co","SOM")]
# Log-transformed responses (presumably because SOM and As are
# right-skewed -- TODO confirm).
bor.profs$logSOM <- log(bor.profs$SOM)
bor.profs$logAs <- log(bor.profs$As)
# Horizon midpoint depth (same units as Top/Bottom).
bor.profs$mid <- (bor.profs$Top+bor.profs$Bottom)/2
# Promote the data frame to an aqp SoilProfileCollection: declare horizon
# depths, site-level variables, then spatial coordinates and CRS.
# NOTE(review): the statement order matters for the aqp/sp promotion.
depths(bor.profs) <- ID ~ Top + Bottom
site(bor.profs) <- ~ Soil.Type + x + y
coordinates(bor.profs) <- ~ x+y
proj4string(bor.profs) <- CRS(utm)
#=====================================================================================
# Model formulas: regress each target on the covariate abbreviations plus
# depth. The wind-dilution covariates ("CD", "DD") enter the As model only.
som.ph.covs <- CovAbb[!CovAbb %in% c("CD", "DD")]
As.fun <- as.formula(paste("As ~", paste(c(CovAbb, "depth"), collapse = "+")))
SOM.fun <- as.formula(paste("SOM ~", paste(c(som.ph.covs, "depth"), collapse = "+")))
pH.fun <- as.formula(paste("pH ~", paste(c(som.ph.covs, "depth"), collapse = "+")))
# If you want to run the functions step by step interactively, run the line below.
#base.model=SOM.fun; profiles=bor.profs; cov.grids=gridmaps.sm2D; poly.deg=1; num.folds=10; num.means=5; interactions=TRUE; hier=FALSE; preproc=TRUE; seed=321
# stratfold3D function
# type: built-in
# Create stratified folds, taking into account 3D position of observation.
# 1. In the first step, observations are clustered according to k-means clustering (k=cent)
# 2. then, in each cluster:
# a. each profile is aggregated by a weighted mean
# b. profiles were sampled randomly according to number of folds
# 3. Merge each fold with corresponding folds in other clusters
# Input data:
# ==============================
# regdat - input data matrix with "ID", "hdept" and coordinates columns.
# targetVar - name of target variable
# folds - number of folds
# cent - number of centers for k-means clustering
# dimensions - "2D" or "3D"; if dimensions = "3D", k-means clustering takes depth into computation
# IDs (binary) - identify if the list of folds with profiles' IDs must be returned
# sum (binary) - identify if the summary statistics for each fold should be computed (to inspect the folds)
# ==============================
# Create spatially stratified cross-validation folds for soil-profile data.
# Profiles are first clustered in space with k-means; folds are then sampled
# within each cluster, so every fold spans all spatial clusters.
#
# targetVar  - name (string) of the target variable column in 'regdat'
# regdat     - data frame with "ID", "hdepth", "longitude", "latitude"
#              columns ("altitude" additionally required for dimensions="3D")
# folds      - number of folds to create
# cent       - number of k-means cluster centers
# seed       - RNG seed set before each createFolds call (reproducibility)
# dimensions - "2D" or "3D"; "3D" also uses scaled altitude in the clustering
# IDs        - TRUE: folds are lists of profile IDs; FALSE: row indexes
# sum        - TRUE: return data + folds + per-fold summaries; FALSE: data only
stratfold3d <- function(targetVar, regdat, folds = 5, cent = 3, seed = 666, dimensions = list("2D","3D"), IDs = TRUE, sum = FALSE){
# First element of 'dimensions' acts as the default ("2D") when not supplied.
dimensions <- dimensions[[1]]
if(dimensions == "2D"){
# One row per profile: 'target' is the horizon-depth-weighted mean of
# targetVar; eval(parse(...)) resolves the column named by the string.
unique.df <- ddply(regdat, .(ID), here(summarize), target=weighted.mean(eval(parse(text=targetVar)), hdepth), longitude=longitude[1], latitude=latitude[1])
km <- kmeans(cbind(unique.df$longitude, unique.df$latitude), centers = cent)
unique.df$km <- as.factor(km$cluster)} else {
# 3D: additionally aggregate altitude (min over horizons) and include it,
# scaled together with the coordinates, in the k-means clustering.
unique.df <- ddply(regdat, .(ID), here(summarize), target=weighted.mean(eval(parse(text=targetVar)), hdepth), longitude=longitude[1], latitude=latitude[1], altitude=min(altitude))
km <- kmeans(cbind(scale(unique.df$longitude), scale(unique.df$latitude), scale(unique.df$altitude)), centers = cent)
unique.df$km <- as.factor(km$cluster)
}
#============== Creating a list of cluster regions =======================================
k.list <- as.list(rep(NA,length(unique(unique.df$km))))
names(k.list) <- paste("k",c(1:length(k.list)),sep="")
#============== Creating list of n folds per cluster region with profile indexes =========
# Within each spatial cluster, split its profiles into 'folds' groups
# (stratified on the aggregated target via caret::createFolds), then map the
# per-cluster row positions back to profile IDs.
for(i in 1:length(k.list)){
set.seed(seed)
k.list[[i]] <- createFolds(unique.df[which(unique.df$km==levels(unique.df$km)[i]),"target"],k=folds)
for(j in 1:folds){
k.list[[i]][[j]] <- unique.df[which(unique.df$km == levels(unique.df$km)[i]),"ID"][k.list[[i]][[j]]]
}
}
#=========================================================================================
#============== Creating list of folds with indexes of profiles per each fold ============
# Fold j = union over all clusters of each cluster's j-th group of profile IDs.
ID.list <- as.list(rep(NA,folds))
names(ID.list) <- paste("fold",c(1:folds),sep = "")
for(i in 1:folds){
ID.list[[i]] <- do.call(c,lapply(k.list,function(x) x[[i]]))
names(ID.list[[i]]) <- NULL
}
#=========================================================================================
#================= Creating list of folds with observations' indexes =====================
# Translate profile IDs into row indexes of 'regdat' (one row per horizon).
folds.list <- as.list(rep(NA,folds))
names(folds.list) <- paste("fold",c(1:folds),sep = "")
for(i in 1:length(ID.list)){
folds.list[[i]] <- which(regdat$ID %in% ID.list[[i]])
}
#=========================================================================================
# Stack all observations back together with a 'fold' label column.
pom <- data.frame()
for(i in 1:length(folds.list)){
allData1 <- regdat[folds.list[[i]],]
allData1$fold <- paste("fold",i,sep="")
allData <- rbind(allData1,pom)
pom <- allData
}
allData$fold<-factor(allData$fold)
# Per-fold summaries of depth and of the target variable (for inspecting the
# stratification). NOTE(review): assumes 'regdat' carries a "depth" column.
sum.list=list(by(allData$depth,allData$fold,summary))
if(IDs==TRUE){index.list=(ID.list)}else{index.list=(folds.list)}
sum.list<-list(allData,index.list,sum.list,by(allData[,paste(targetVar)],allData$fold,summary))
names(sum.list)<-c("Data","folds","depth summary",paste(targetVar,"summary", sep=" "))
# sum=TRUE -> full list (data, folds, summaries); otherwise just the data.
if(sum==TRUE){return(sum.list)}else{return(sum.list[[1]])}
}
#=========== pre. sparsereg3D - function for performing whole data preparation for model selection and model assessment =======
#==== Input data are :
# base.model - object of class "formula" in the form of: "target variable~covariates+depth" (without interactions)
# profiles - object of class "SoilProfileCollection" with observations of target variables
# cov.grids - object of class SpatialPixelsDataFrame with covariates
# poly.deg - number defining the degree of polynomial depth function
# cv.folds - number of folds (necessary for data partitioning)
# cent-num.means - number of clusters in k-means clustering (necessary for data partitioning)
# interactions - binary
# preproc - binary
# hier - binary
#
# Prepare all data structures needed for model selection, prediction and
# nested-CV assessment of a sparse 3D regression model. See the parameter
# description in the comment header above.
# Returns a list with elements: profiles, cov.grids, model, num.folds,
# num.means, preproc, folds.
pre.sparsereg3D <- function(base.model, profiles, cov.grids, hier=FALSE, lambda=seq(0,5,0.1), poly.deg=3, num.folds=5,num.means=3,interactions=TRUE,preproc=TRUE,seed=321){
"%ni%" <- Negate("%in%")
# hier implies interactions; no interactions implies no hierarchy
if(hier == TRUE){interactions = TRUE}
if(interactions == FALSE){hier = FALSE}
#======= all columns of interest ===============================
target <- all.vars(base.model)[1] # extract name of target variable
## check if at least one covariate in the formula is available in the grids:
if(sum(names(cov.grids) %in% all.vars(base.model)[-1])==0){
stop("None of the covariates in the 'formulaString' matches the names in the 'covariates' object")
}
#===============================================================
p4s = proj4string(profiles) # extract spatial reference from profiles
profiles <- join(profiles@horizons,data.frame(data.frame(profiles@site),data.frame(profiles@sp)),type="inner") #flatten the SoilProfileCollection into one data frame: "ID","Top","Bottom",observed variables,"x","y"
#============== Names ==========================================
coord.names <- tail(names(profiles),2); #extract names of coordinate axes (last two columns after the join)
pr.names <- head(names(profiles),3); # extract names of profile-specific variables: "ID","Top","Bottom"
#============== Adding altitude and hdepth =====================
profiles$depth <- - (profiles$Top / 100 + (( profiles$Bottom - profiles$Top ) / 2) / 100) #horizon midpoint depth, converted from cm to m and negated (depth increases downward)
profiles$hdepth <- profiles$Bottom - profiles$Top # thickness of each horizon (weight for the data stratification)
#============== Overlay ========================================
profiles <- profiles[complete.cases(profiles[,c("ID",coord.names,"hdepth","depth",target)]),c("ID",target,"hdepth",coord.names,"depth")] #remove rows with NA in "ID", coordinates, "hdepth", "depth" or the target
coordinates(profiles) <- ~ x + y #promote to a Spatial* object so a spatial overlay is possible
proj4string(profiles) <- p4s # defining spatial reference
profiles <- spTransform(profiles, proj4string(cov.grids)) # reproject profiles into the CRS of the covariate grids
ov <- over(profiles, cov.grids) # spatial overlay: extract covariate values at the sample locations
factor.names <- ov %>% subset(., select=which(sapply(., is.factor))) %>% names() # names of the categorical covariates
for(i in factor.names){
ov[,i] <- factor(ov[,i]) # drop empty levels from categorical covariates
}
#======== Preparing data input matrix with following columns: "ID",target,"hdepth",coord.names, sp.cov.names, "depth"
sp.cov.names <- names(ov[,which(names(ov) %in% c(all.vars(base.model)))]) # names of the spatial covariates that participate in the model formula
profiles <- cbind(as.data.frame(profiles), ov[,sp.cov.names]) # combine profiles with the extracted covariate values
profiles <- profiles[complete.cases(profiles[,all.vars(base.model)]),c("ID",target,"hdepth",coord.names, sp.cov.names, "depth")] #remove rows with NA in any model variable
#======== Adding polynomial depth terms in input data matrix, only if poly.deg > 1
if(poly.deg > 1){
profiles <- cbind(profiles,poly(profiles$depth,poly.deg,raw=TRUE,simple=TRUE)[,-1])
names(profiles) <- c(names(profiles)[1:(length(names(profiles))-(poly.deg-1))],(paste("depth",c(2:poly.deg),sep="")))
}
#===================== Dummy coding =======================================
dummy.par <- dummyVars(as.formula(paste("~", paste(c(all.vars(base.model))[-1], collapse="+"))),profiles,levelsOnly=FALSE) # dummy-coding scheme for the categorical variables (stored for reuse on prediction grids)
profiles <- cbind(profiles[,which(colnames(profiles) %in% c("ID","hdepth",target,coord.names))], predict(dummy.par, newdata = profiles)) # apply the dummy coding to the input data matrix
colnames(profiles) <- gsub( "\\_|/|\\-|\"|\\s" , "." , colnames(profiles) ) # replace special characters in variable names by "."
main.effects <- colnames(profiles)[-which(colnames(profiles) %in% c("ID","hdepth",target,coord.names))] # names of all main effects, including the dummy-class columns
#==========================================================================
#======= Computing interactions in input data matrix, only if interactions == TRUE =============================================
if (interactions == TRUE){
X <- hierNet::compute.interactions.c(as.matrix(profiles[,-c(which(colnames(profiles) %in% c("ID",target,"hdepth",coord.names)))]),diagonal=FALSE) # all pairwise interactions between main effects
#======= Names of the interaction columns to keep: interactions between spatial covariates and the depth terms only
if(poly.deg > 1){ columns.to.keep <- colnames(X[,do.call(c,lapply(strsplit(colnames(X),":"), function(x) x[2] %in% c("depth",paste("depth",c(2:poly.deg),sep="")) & x[1] %ni% c("depth",paste("depth",c(2:poly.deg),sep=""))))]) } else {
columns.to.keep <- (X %>% as.data.frame() %>% subset(., select=grep("depth", names(.), value=TRUE)) %>% colnames())
}
#======= If hier=TRUE, interaction columns other than depth interactions are zeroed (hierNet needs the full matrix); otherwise they are dropped.
if(hier == TRUE) { X[,colnames(X) %ni% columns.to.keep ] <- 0
profiles <- cbind(profiles,X)
}else{
profiles <- cbind(profiles,X[,colnames(X) %in% columns.to.keep])
}
}
#===================== Preprocessing (standardization) of input data =======================================
st.par <- NULL # bug fix: must exist even when preproc == FALSE, because it is stored in the output list below ("object 'st.par' not found" otherwise)
if(preproc==TRUE){
if(interactions == TRUE) {
# input data matrix %>% select columns to be transformed %>% compute centering/scaling parameters
st.par <- as.data.frame(profiles) %>% subset(., select = c(main.effects,columns.to.keep)) %>% preProcess(.,method=c("center", "scale"))
profiles <- predict(st.par,newdata = profiles) # apply the preprocessing parameters to the input data matrix
}else{
# if interactions = FALSE, only the main effects are transformed
st.par <- as.data.frame(profiles) %>% subset(., select = c(main.effects)) %>% preProcess(.,method=c("center", "scale"))
profiles <- predict(st.par,newdata = profiles)
}
}
#======================== Data stratification ====================================================
profiles <- as.data.frame(profiles) #
profiles <- plyr::rename(profiles, replace=c("x" = "longitude", "y" = "latitude")) #rename coordinate axes to "longitude"/"latitude" (required by stratfold3d)
strat <- stratfold3d(targetVar = target, seed = seed, regdat = profiles, folds = num.folds, cent = num.means, dimensions = "2D", IDs = TRUE, sum = TRUE) # stratified folds as lists of profile IDs
profile.folds.list <- strat$folds # list of folds holding profile IDs
obs.folds.list <- stratfold3d(targetVar=target, seed=seed,regdat=profiles, folds=num.folds,cent=num.means, dimensions="2D", IDs=FALSE,sum=TRUE)$folds ## list of folds holding observation (row) indexes
#=====================================================================================================
# Output object with all data needed for model selection, prediction and model assessment via nested CV.
if(interactions == TRUE){
out <- list(profiles = profiles, cov.grids = cov.grids, model = list(base.model = base.model, target = target, main.effects = main.effects, poly.deg = poly.deg, interactions = interactions ,hier = hier, depth.interactions = columns.to.keep, all.interactions = colnames(X)), num.folds = num.folds, num.means = num.means, preproc = list(dummy.par = dummy.par, st.par = st.par, columns.to.keep = columns.to.keep), folds = list(profile.folds.list = profile.folds.list, obs.folds.list = obs.folds.list, depth.summary = strat[[3]], target.summmary = strat[[4]]))
} else {
out <- list(profiles = profiles, cov.grids = cov.grids, model = list(base.model = base.model, target = target, main.effects = main.effects, poly.deg = poly.deg, interactions = interactions ,hier = hier), num.folds = num.folds, num.means = num.means, preproc = list(dummy.par = dummy.par, st.par = st.par), folds = list(profile.folds.list = profile.folds.list, obs.folds.list = obs.folds.list, depth.summary = strat[[3]], target.summmary = strat[[4]]))
}
return(out)
}
#=========== sparsereg3D.ncv - function for model assessment via nested cv. =======
#==== Input data are :
# sparsereg - output from "pre.sparsereg3D" function
# lambda - vector of lambda values for lasso regression
# seed - random number generator (in order to make computation reproducible)
# Nested cross-validation assessment of the sparse 3D model.
# Outer loop: pre-computed profile folds; inner loop: re-stratified folds on
# the training part, used to tune lambda (glmnet or hierNet).
# Returns accuracy measures per outer fold, coefficients, and per-fold
# train/test predictions.
sparsereg3D.ncv <- function(sparsereg, lambda = seq(0,5,0.1), seed = 321){
#== Extracting data from "sparsereg" object:
flist <- sparsereg$folds$profile.folds.list # list of folds with indexes of profiles
cv.folds <- sparsereg$folds$obs.folds.list # list of folds with indexes of observations
profiles <- sparsereg$profiles # input data (column order: "ID", target, "hdepth", coordinates, ...)
num.folds <- sparsereg$num.folds # number of folds (for inner-loop stratification)
num.means <- sparsereg$num.means # number of spatial cluster means (for inner-loop stratification)
target <- sparsereg$model$target # name of target variable
target.min <- min(profiles[,target]) # minimum observed target value (used to floor predictions)
interactions = sparsereg$model$interactions # TRUE/FALSE
hier = sparsereg$model$hier # TRUE/FALSE
main.effects = sparsereg$model$main.effects # names of main effects
poly.deg = sparsereg$model$poly.deg # degree of polynomial depth function
if(interactions){
columns.to.keep = sparsereg$model$depth.interactions # names of interaction columns in the final model
all.interactions = sparsereg$model$all.interactions # names of all interaction terms
}
# Preallocate containers for the results.
results <- data.frame(lambda = rep(NA,length(flist)+1), RMSE = rep(NA,length(flist)+1), Rsquared = rep(NA,length(flist)+1))
coef.list = as.list(rep(NA,length(flist)))
test.prediction <- data.frame()
train.results <- as.list(rep(NA,length(flist)))
test.results <- as.list(rep(NA,length(flist)))
#==================================================================================
#===== Outer loop: fold i is the test set, the rest is training ===================
for(i in 1:length(flist)){
train.obs.ind <- which(profiles$ID %in% do.call(c, flist[-i]))
train.data <- profiles[train.obs.ind,]
train.profiles.ids <- flist[-i]
test.data <- profiles[-train.obs.ind,]
test.profiles.ids <- flist[i]
# ===== Inner partitioning: re-stratify the training data into num.folds folds
inner.partitioning <- stratfold3d(target, train.data, folds = num.folds, seed = seed ,cent = num.means, dimensions = "2D", sum = TRUE, IDs = TRUE)$folds
new.folds <- as.list(rep(NA,length(inner.partitioning)))
names(new.folds) <- paste("fold",c(1:length(inner.partitioning)),sep = "")
which.new.fold <- rep(NA,dim(train.data)[1]) # inner-fold membership per training observation
for(j in 1:length(inner.partitioning)){
new.folds[[j]]<-which(train.data$ID %in% inner.partitioning[[j]])
which.new.fold[new.folds[[j]]]<-j
}
#======================================================================================
#============== LASSO training ========================================================
if(!hier){
# Subset to target + model columns, so column 1 is the target variable.
if(interactions){
train.data <- train.data %>% subset(., select = c(target, main.effects, columns.to.keep))
test.data <- test.data %>% subset(., select = c(target, main.effects, columns.to.keep))
} else {
train.data <- train.data %>% subset(., select = c(target, main.effects))
test.data <- test.data %>% subset(., select = c(target, main.effects))
}
# Lasso
lasso.cv <- cv.glmnet(as.matrix(train.data[,-1]), train.data[,1], alpha = 1,lambda = lambda, foldid = which.new.fold, type.measure = "mse") # tune lambda on the inner folds
test.pred <- predict(lasso.cv, s = lasso.cv$lambda.min, newx = as.matrix(test.data[,-1])) # prediction on test data (outer fold)
test.pred <- pmax(test.pred, target.min/3) # floor predictions at 1/3 of the minimum observed value
# Training prediction ==============================================================================================================
train.pred <- predict(lasso.cv,s = lasso.cv$lambda.min, newx = as.matrix(train.data[,-1])) # prediction on training data
train.pred <- pmax(train.pred,target.min/3) # floor predictions at 1/3 of the minimum observed value
train.pred <- data.frame(predicted = as.numeric(train.pred)) # training predictions as data frame
train.per.fold <- data.frame(ID = profiles[train.obs.ind,"ID"], observed = train.data[,1], predicted = train.pred,longitude = profiles[train.obs.ind,"longitude"], latitude = profiles[train.obs.ind,"latitude"], depth = profiles[train.obs.ind,"depth"]) # training predictions with location/depth (for residual variogram analysis)
train.results[[i]] <- train.per.fold # store training predictions
#==========================================================================================================================================================
# Test prediction =========================================================================================================================================
## test predictions with location/depth columns (for residual variogram analysis)
test.per.fold <- data.frame(ID = profiles[-train.obs.ind,"ID"], observed = test.data[,1], predicted = as.numeric(test.pred), longitude = profiles[-train.obs.ind,"longitude"], latitude = profiles[-train.obs.ind,"latitude"], depth = profiles[-train.obs.ind,"depth"])
test.results[[i]] <- test.per.fold
test.obs.pred <- data.frame(obs = test.data[,1], pred = as.numeric(test.pred)) # test obs/pred data frame for this ncv model
coef.list[[i]] <- predict(lasso.cv, type = "coefficients", s = lasso.cv$lambda.min) # coefficients of this ncv model
fold.measures <- data.frame(lambda = lasso.cv$lambda.min, RMSE = defaultSummary(test.obs.pred)[1], Rsquared = defaultSummary(test.obs.pred)[2]) # accuracy measures for this outer fold
results[i,] <- fold.measures # store accuracy measures
test.prediction <- rbind(test.prediction,test.obs.pred) # combine all test predictions into one data frame
#==========================================================================================================================================================
}else{
# Hierarchy setting requires separating main effects and interaction effects
# (train.data/test.data keep the full column set here; column 1 is "ID").
train.main.effects <- as.matrix(train.data[,main.effects])
test.main.effects <- as.matrix(test.data[,main.effects])
train.int.effects <- as.matrix(train.data[,all.interactions])
test.int.effects <- as.matrix(test.data[,all.interactions])
train.target <- train.data[,target]
test.target <- test.data[,target]
#==================================================================================================================================
hier.path = hierNet.path(train.main.effects, train.target, zz = train.int.effects, diagonal=FALSE, strong=TRUE, trace=0, stand.main = FALSE, stand.int = FALSE) # fit the lambda path on training data
hier.lasso.cv = hierNet.cv(hier.path, train.main.effects, train.target, folds = new.folds, trace=0) # cross-validate the whole path on the inner folds
final.hier.model <- hierNet(train.main.effects, train.target, zz = train.int.effects, diagonal=FALSE, strong=TRUE, lam=hier.path$lamlist[which(hier.lasso.cv$lamhat==hier.path$lamlist)]) # refit at the CV-optimal lambda
test.pred <- predict(final.hier.model,newx=test.main.effects,newzz = test.int.effects) # prediction on test data
test.pred <- pmax(test.pred,target.min/3) # floor predictions at 1/3 of the minimum observed value
# Coefficients of this ncv model (ie - interaction effects; me - main effects)
ie <- as.matrix(hier.path$th[,,which(hier.lasso.cv$lamhat==hier.path$lamlist)][,length(main.effects)]) # NOTE(review): extracts only the last column of the theta matrix -- confirm this is the intended depth-interaction column
me <- hier.path$bp[,which(hier.lasso.cv$lamhat==hier.path$lamlist), drop = F] - hier.path$bn[,which(hier.lasso.cv$lamhat==hier.path$lamlist), drop = F] # main-effect coefficients (positive minus negative parts)
# Training prediction ==============================================================================================================
train.pred <- predict(final.hier.model,newx=train.main.effects,newzz = train.int.effects)
train.pred <- pmax(train.pred,target.min/3)
train.pred <- data.frame(predicted=as.numeric(train.pred))
train.per.fold <- data.frame(ID = train.data[,"ID"], observed = train.target, predicted = train.pred, longitude = train.data[,"longitude"], latitude = train.data[,"latitude"], depth = profiles[train.obs.ind,"depth"]) # bug fix: 'observed' was train.data[,1], which is the "ID" column here, not the target
train.results[[i]] <- train.per.fold
#====================================================================================================================================
# Test prediction ==============================================================================================================
test.per.fold <- data.frame(ID = test.data[,"ID"], observed = test.target, predicted = as.numeric(test.pred), longitude = test.data[,"longitude"], latitude = test.data[,"latitude"], depth = profiles[-train.obs.ind,"depth"]) # bug fix: 'observed' was test.data[,1] ("ID"), not the target
test.results[[i]] <- test.per.fold
test.obs.pred <- data.frame(obs=test.target,pred=as.numeric(test.pred)) # test obs/pred data frame (as.numeric for consistency with the non-hier branch)
coef.list[[i]] <- rbind(ie,me) # coefficient matrix for this ncv model
fold.measures <- data.frame(lambda=hier.lasso.cv$lamhat,RMSE=defaultSummary(test.obs.pred)[1],Rsquared=defaultSummary(test.obs.pred)[2]) # accuracy measures for this outer fold
results[i,] <- fold.measures # store accuracy measures
test.prediction<-rbind(test.prediction,test.obs.pred) # combine all test predictions into one data frame
#===============================================================================================================================
}
}
# Storing results: last row holds overall measures pooled over all outer folds
results[length(flist)+1,] <- c(NA, RMSE = defaultSummary(test.prediction)[1], Rsquared = defaultSummary(test.prediction)[2])
coef.mat <- do.call(cbind, coef.list)
out <- list(accuracy.measures = results, coefficients = coef.mat, test.prediction = test.prediction, folds = cv.folds, train.results = train.results, test.results = test.results)
return(out)
}
#=========== sparsereg3D.pred - function for sparse model selection and prediction =======
#==== Input data are :
# sparsereg - output from "pre.sparsereg3D" function
# lambda - vector of lambda values for lasso regression
# seed - random number generator (in order to make computation reproducible)
# prediction (binary) - if prediction over grid should be performed
# depths - depths defining on which depth prediction has to be made (only if prediction = TRUE)
# chunk - number of grid cells processed in one step (to speed up the computation)
sparsereg3D.pred <- function(sparsereg, lambda = seq(0,5,0.1), prediction = TRUE, seed = 321, depths = c(-.1,-.3), chunk = 1000){
#== Extracting data from "sparsereg" object:
flist = sparsereg$folds$profile.folds.list # list of folds with indexes of profiles
cv.folds = sparsereg$folds$obs.folds.list # list of folds with indexes of observations
preproc.par = sparsereg$preproc # preprocessing paramters (necessary for preparing grids for prediction)
profiles = sparsereg$profiles # input data
cov.grids = cov.grids # prediction grids with covariates
target = sparsereg$model$target # name of target variable
target.min = min(profiles[,target]) # minimum of observed value of target variable (necessary for correcting negative predictions)
interactions = sparsereg$model$interactions # binary parameter (identifying does interactions = TRUE or FALSE)
base.model = sparsereg$model$base.model # base model
hier = sparsereg$model$hier # binary parameter (identifying does hier = TRUE or FALSE)
main.effects = sparsereg$model$main.effects # names of main effects
poly.deg = sparsereg$model$poly.deg # degree of polynomial depth function
if(interactions){
columns.to.keep = sparsereg$model$depth.interactions # names of columns to keep (which variables constitute final model)
all.interactions = sparsereg$model$all.interactions # names of all interactions terms (necessary for "hier" setting)
}
which.folds.list <- as.list(rep(NA,length(flist)))
names(which.folds.list) <- paste("fold",c(1:length(flist)),sep = "")
which.fold <- rep(NA,dim(profiles)[1]) # vector of indexes identifying what fold each observation is in.
for(j in 1:length(flist)){
which.folds.list[[j]] <- which(profiles$ID %in% flist[[j]])
which.fold[which.folds.list[[j]]]<-j
}
#============== LASSO training ========================================================
if(!hier){
if(interactions){
input.data <- profiles %>% subset(., select = c(target, main.effects, columns.to.keep))
}else{
input.data <- profiles %>% subset(., select = c(target, main.effects))
}
# Lasso ===============================================================================================
lasso.cv <- cv.glmnet(as.matrix(input.data[,-1]), input.data[,1], alpha = 1,lambda = lambda, foldid = which.fold, type.measure = "mse") # lasso cv based on data partitioning defined in "which.fold"
lasso.pred <- predict(lasso.cv,s=lasso.cv$lambda.min,newx=as.matrix(input.data[,-1])) # Prediction of final model on whole data set (maybe not necessary)
lasso.pred <- pmax(lasso.pred,target.min/3) # correcting of negative predictions
obs.pred <- data.frame(obs=input.data[,1],pred=as.numeric(lasso.pred)) # obs/pred data frame
coef.list <- predict(lasso.cv,type="coefficients",s=lasso.cv$lambda.min) # coefficients of final model
results <- data.frame(lambda=lasso.cv$lambda.min,RMSE=defaultSummary(obs.pred)[1],Rsquared=defaultSummary(obs.pred)[2]) # Accyracy measures derived from prediction of final model on whole data (maybe not necessary)
}else{
# Hierarchy setting requires the separation of main effects and interaction effects and also the training and test data.
input.data.x <- as.matrix(profiles[,main.effects])
input.data.z <- as.matrix(profiles[,all.interactions])
input.data.y <- (profiles[,target])
hier.path = hierNet.path(input.data.x,input.data.y, zz = input.data.z, diagonal=FALSE, strong=TRUE, trace=0, stand.main = FALSE, stand.int = FALSE) # Fit path of lambda on whole data
hier.lasso.cv = hierNet.cv(hier.path, input.data.x, input.data.y, folds = which.folds.list, trace=0) # perform cross-validation on whole path
hier.lasso.final <- hierNet(input.data.x,input.data.y, zz = input.data.z, diagonal=FALSE, strong=TRUE, lam = hier.path$lamlist[which(hier.lasso.cv$lamhat == hier.path$lamlist)], center = TRUE, stand.main = FALSE, stand.int = FALSE) # Fit the final model on whole data
hier.lasso.pred <- predict(hier.lasso.final, newx = input.data.x, newzz = input.data.z) # Perform the prediction on whole data (not necessary)
hier.lasso.pred <- pmax(hier.lasso.pred,target.min/3) # correcting of negative predictions
# Extracting the coefficients of final model (ie-interaction effects; me- main effects)
if(poly.deg == 1){ ie <- as.matrix(hier.path$th[,,which(hier.lasso.cv$lamhat==hier.path$lamlist)][,length(columns.to.keep)])
} else {
ie <- as.matrix(hier.path$th[,,which(hier.lasso.cv$lamhat==hier.path$lamlist)][,length(columns.to.keep)-poly.deg+1:length(columns.to.keep)])
}
me <- hier.path$bp[,which(hier.lasso.cv$lamhat==hier.path$lamlist), drop = F] - hier.path$bn[,which(hier.lasso.cv$lamhat==hier.path$lamlist), drop = F]
#============================================================================================================================================================
obs.pred <- data.frame(obs=input.data.y, pred=hier.lasso.pred) # obs/pred data frame (not necessary)
coef.list <- data.frame(cov.name=colnames(input.data.x),me,ie) # coefficients
dfresults <- data.frame(lambda = hier.lasso.cv$lamhat, RMSE = defaultSummary(obs.pred)[1], Rsquared = defaultSummary(obs.pred)[2]) # Accyracy measures derived from prediction of final model on whole data (maybe not necessary)
}
# Regression summary list containing the final model, final lambda, coefficients, obs/pred data frames)
if(!hier){
regression.summary <- list(model = list(model = lasso.cv, lambda = lasso.cv$lambda.min), preproc.par = preproc.par, coefficients = coef.list, obs.pred = data.frame(profiles[,1:5], obs.pred))
}else{
regression.summary <- list(model = list(model = hier.lasso.final, lambda = hier.path$lamlist[which(hier.lasso.cv$lamhat == hier.path$lamlist)]), preproc.par = preproc.par, coefficients = coef.list, obs.pred = data.frame(profiles[,1:5], obs.pred))
}
#====== Prediction on grids =============================================================================================================================
if(prediction){
grids.3D <- sp3D(cov.grids, stdepths = depths) # Creating 3D grids (list of grids, each corresponds to different depth)
cores = 1 # defining number of cores (necessary to be 1, because function bellow doesn't work on many cores)
grids.3D <- mclapply(grids.3D, function(x) as.data.frame(x), mc.cores = cores) # Converting grids into data frames
grids.3D <- lapply(grids.3D, function(x) {names(x) <- c(names(x)[1:(length(names(x))-(poly.deg))], c("depth"));return(x)}) # renameing the name of "altitude" column to "depth" column in each grid. (Besause function above uses word "altitude" to denote "depth") )
grids.3D <- mclapply(grids.3D, function(x) subset(x,select=c(all.vars(base.model))[-1],drop=FALSE),mc.cores=cores) %>% mclapply(.,function(x) predict(preproc.par$dummy.par,newdata=x),mc.cores=cores) %>% mclapply(., function(x) {colnames(x) <- gsub( "\\_|/|\\-|\"|\\s" , "." , colnames(x) );return(x)},mc.cores=cores) # Applying dummy.par to transform categorical variables in each grid.
# Adding polynomial depth variables in each grid (only if poly.deg > 1)
if(poly.deg > 1) { grids.3D <- lapply(grids.3D, function(x) x <- cbind(x, poly(x$depth, poly.deg, raw=TRUE, simple=TRUE)[,-1]))
grids.3D <- lapply(grids.3D, function(x) {names(x) <- c(names(x)[1:(length(names(x))-(poly.deg))], c("depth",paste("depth",c(2:poly.deg),sep="")));return(x)})}
#==================== Compute Interactions ==============================================================================
if(interactions){
n <- nrow(grids.3D[[1]]) # defining how many rows each grid contains
r <- rep(1:ceiling(n/chunk),each = chunk)[1:n] # split each grid according to chunk
grids.int.3D <- lapply(grids.3D, function(x) as.data.frame(x)) %>% lapply(., function(x) split(x,r)) # Split each grid in several smaller data frame, according to chunk values
m.cores <- detectCores() # detect the number of cores (in order to split computation of interactions per cores)
registerDoParallel(cores=m.cores) # defining parallel computation process
# Computing interactions for each grid, chunk after chunk, and then combine.
for(i in 1:length(grids.int.3D)){
grids.int.3D[[i]] <- foreach(j = grids.int.3D[[i]],.combine="rbind") %dopar% {hierNet::compute.interactions.c(as.matrix(j),diagonal = FALSE)}
}
}
#======================= Standardization of each grid according to the parameters stored previously (st.par) ============================
for( i in 1:length(grids.int.3D)) {
grids.3D[[i]] <- cbind(grids.3D[[i]],grids.int.3D[[i]])
}
grids.3D <- mclapply(grids.3D,function(x) predict(preproc.par$st.par,newdata=x),mc.cores=cores) %>% mclapply(.,function(x) subset(x[,which(colnames(x) %in% c(main.effects,columns.to.keep))]))
#======================= Columns to keep ============================
# For hierarchical setting interaction effects other than interactions with depth must be setted to zero columns
if(interactions == TRUE){
if(hier == TRUE){
for(i in 1:length(grids.int.3D)) {
grids.3D[[i]] <- subset(grids.3D[[i]][,which(colnames(grids.3D[[i]]) %in% main.effects)])
grids.int.3D[[i]][,colnames(grids.int.3D[[i]]) %ni% columns.to.keep ] <- 0
}
}
}
#====================== FINAL Prediction ==================================
if(!hier){
for(i in 1:length(grids.3D)){
grids.3D[[i]]$pred <- as.numeric(predict(lasso.cv,s=lasso.cv$lambda.min, newx = grids.3D[[i]])) # Applying final model to predict on whole each grid
grids.3D[[i]]$pred <- pmax(grids.3D[[i]]$pred,target.min/3) # Correcting the negative predictions
grids.3D[[i]] <- grids.3D[[i]][,"pred"] # Creating of grids of final predictions
}
} else {
for( i in 1:length(grids.3D)){
grids.3D[[i]]$pred <- as.numeric(predict(hier.lasso.final, newx=grids.3D[[i]], newzz = grids.int.3D[[i]])) # Applying final model to predict on whole each grid
grids.3D[[i]]$pred <- pmax(grids.3D[[i]]$pred,min.obs/3) # Correcting the negative predictions
grids.3D[[i]] <- grids.3D[[i]][,"pred"] # Creating of grids of final predictions
}
}
return(list(prediction=grids.3D, regression.summary = regression.summary))
}
else {return(regression.summary = regression.summary)}
}
# ---- Demo: fit, cross-validate and predict a 3D sparse regression model for SOM ----
# NOTE(review): the original first call to pre.sparsereg3D() (no interactions,
# default folds/poly.deg) was immediately overwritten by the fully-specified
# call below, so it is kept only as a comment:
# pre.som <- pre.sparsereg3D(base.model = SOM.fun, hier = FALSE, profiles = bor.profs, cov.grids = gridmaps.sm2D)
# NOTE(review): a stray scrape artifact `sparsereg = out` (referencing an
# undefined object `out`) was removed here — it would error at run time.

# Prepare model inputs (design matrix, interactions, folds, standardization
# parameters) for the SOM target with cubic depth polynomial.
pre.som <- pre.sparsereg3D(
  base.model = SOM.fun,
  hier = FALSE,
  profiles = bor.profs,
  interactions = TRUE,
  num.folds = 10,
  cov.grids = gridmaps.sm2D,
  poly.deg = 3
)

# Nested cross-validation over the lambda path; this is the object that
# carries $accuracy.measures.
sp.reg.som <- sparsereg3D.ncv(pre.sparsereg = pre.som, lambda = seq(0, 5, 0.1), seed = 321)

# Spatial prediction on the 3D grids. Stored in its own variable so the
# cross-validation results in sp.reg.som are not clobbered (the original
# reassigned sp.reg.som here, which made the next line return NULL).
pred.som <- sparsereg3D.pred(sparsereg = pre.som, prediction = TRUE, lambda = seq(0, 5, 0.1), seed = 321)

sp.reg.som$accuracy.measures
# NOTE(review): the two lines below are website-embed boilerplate captured when
# this script was scraped (not R code); commented out so the file parses.
# "Add the following code to your website."
# "For more information on customizing the embed code, read Embedding Snippets."