Nothing
############################################
# Needed packages
#library(parallel)
#library(gtools)
#library(boot)
############################################
#R2 estimation
estim.R2<-function(dataX,y, subset, logistic, any.cat, max.iter){
# Function estim.R2:
# Computes the R2 (explained variance) statistic for linear/logistic
# regression for a specified sub-model (subset of inputs)
# Inputs:
# dataX: data.frame/matrix
# y: num/factor/logical vector
# subset: integer vector, subset of inputs for which to compute the R2
# logistic: logical. If TRUE, then logistic regression is performed
# any.cat: logical. If TRUE, then at least one column of dataX is a factor.
# Output:
# R2: num. R2 value for the desired model (linear/logistic) and for the
# specified subset of inputs.
if(!logistic){
#######################
#Linear Model
if(!any.cat){
################################################################
# Computing by hand with scalar inputs: approx 7x faster than lm
#Subsetting X
X_<-as.matrix(dataX[,as.vector(subset)])
#Adding beta_0
X_<-cbind(rep(1, nrow(X_)), X_)
#Computing beta estimates
estim.beta<-solve(t(X_)%*%X_)%*%t(X_)%*%y
#Computing R2 estimate
R2<-1-(var(y-X_%*%estim.beta)/var(y))
}else{
#################################################################
# Using lm method if there is any categorical inputs
#Subsetting X
X_<-dataX[,as.vector(subset)]
dat<-data.frame(y,X_)
#Computing R2 estimate
R2<-summary(lm(y~., data=dat))$r.squared
}
}else{
#######################
#Logistic Model
#Maximum optimization iterations
fit.control=glm.control(maxit=max.iter)
#Subsetting X
X_<-dataX[,as.vector(subset)]
dat<-data.frame(y,X_)
sum_glm<-summary(glm(y~., data=dat,
family="binomial",
control=fit.control))
#Computing Logistic R2 estimate
R2<-1-(sum_glm$deviance/sum_glm$null.deviance)
}
return(R2)
}
######################################
#Recursive PMVD computation
calc.pmvd.rec.boot<-function(X, y, indices, logistic, max.iter,tol, any.cat, i=1:nrow(X)){
# Function calc.pmvd.rec.boot
# Computes the PMVD indices using the recursive algorithm of Feldman (2005)
# for bootstrap confidence interval computations.
# Inputs:
# X: matrix of data frame.
# y: num/factor/logical vector
# d: int. Covariates dimension.
# indices: list. List of subset indices.
# logistic: logical. If TRUE, logistic regression is performed, else, linear
# model instead.
# any.cat: logical. If TRUE, then at least one column of dataX is a factor.
# i: integer vector. Used for bootstrap estimation.
# Output:
# res: (1xd) matrix. PMVD value for all covariates.
#################################
# Pre-Processing
#Dimension
d=ncol(X)
#Bootstrap row selection
X<-X[i,]
y<-matrix(y,ncol=1)[i,]
#Initiating the conditional elements object
R2s<-rep(list(0), d+1)
#################################
# Conditional elements estimation
for(j in 1:d){
#############################
#R2 estimation
R2s[[j+1]]<-apply(indices[[j+1]], 2,estim.R2,
dataX=X,
y=y,
logistic=logistic,
any.cat=any.cat,
max.iter=max.iter)
}
#################################
# PMVD indices computation
#Preparing the recursive computation of the PMVD indices
#Full model indices
N=1:d
#Initiating P(S)
Ps<-rep(list(0), d+1)
#Computing w({i}) forall i
Ps[[1]]=rev(R2s[[d+1]] - R2s[[d]])
#Finding if any w({i})=0 or |w({i})|<tol
if(is.null(tol)){
logi.zeros<-Ps[[1]]==0
}else{
logi.zeros<-abs(Ps[[1]])<tol
}
if(any(logi.zeros)){
var_null<-which(logi.zeros)
nb_null<-length(var_null)
idx_ind_null=rep(list(0), d-nb_null+1)
indices_<-rep(list(0), d-nb_null)
for(i in 1:(d-nb_null)){
checkmat<-matrix(is.element(indices[[i+1]], var_null), i, ncol(indices[[i+1]]))
idx_ind_null[[i]] <- as.vector(which(colSums(checkmat) == 0))
indices_[[i]]<-matrix(indices[[i+1]][, idx_ind_null[[i]]],ncol=length(idx_ind_null[[i]]))
}
p=d #Original number of dimensions
d=d-nb_null #Number of dimensions after removing spurious covariates
Ps[[1]]<-Ps[[1]][setdiff(N, var_null)] #Only keeping w({i}) for non-spurious covariates
}else{
p=d
nb_null=0
}
#Value of R2(full_model) to avoid a lot of search
R2_comp<-R2s[[p+1]]
for(ord in 2:d){
#For every input orders : we start at 2 because Ps[[1]] has already been computed
if(ord==d){
#If this is the last order
Ps[[ord]]=R2_comp*(1/sum(1/Ps[[ord-1]]))
}else{
if(any(logi.zeros)){
#########################
#If any spurious variables
#Computing w(S) for all S of order ord
Ws<-rev(R2_comp - R2s[[d+1-ord]][idx_ind_null[[d-ord]]])
nbset<-ncol(indices_[[ord]]) #Number of subsets S
Ps[[ord]]<-rep(0, nbset) #Setting up the Ps results
for(i in 1:nbset){
S<-c(indices_[[ord]][,i]) #For every possible subset S of order ord
idx_Spi=which(colSums(matrix(indices_[[ord-1]]%in%S, nrow=length(S)-1, ncol=ncol(indices_[[ord-1]])))==length(S)-1) #Find every S\{i} for all i
Ps[[ord]][i]=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi])) #Recursively compute Ps
}
}else{
########################
#If no spurious variables
#Computing w(S) for all S of order ord
Ws<-rev(R2_comp - R2s[[d+1-ord]])
nbset<-ncol(indices[[ord+1]])#Number of subsets S
Ps[[ord]]<-rep(0, nbset)#Setting up the Ps results
for(i in 1:nbset){
S<-c(indices[[ord+1]][,i]) #For every possible subset S of order ord
idx_Spi=which(colSums(matrix(indices[[ord]]%in%S, nrow=length(S)-1, ncol=ncol(indices[[ord]])))==length(S)-1)#Find every S\{i} for all i
Ps[[ord]][i]=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi]))#Recursively compute Ps
}
}
}
}
pmd<-matrix(rev(Ps[[d]]/Ps[[d-1]]), nrow=1)
if(any(logi.zeros)){
#If spurious variable, we set their value to 0
index <- 1
pmd_ <- rep(0, p)
for (i in 1:p) {
if (!is.element(i, var_null)) {
pmd_[i] <- pmd[index]
index <- index + 1
}
}
pmd=matrix(pmd_, nrow=1)
}
colnames(pmd)=colnames(X)
return(pmd)
}
############################################
# Main function
pmvd <- function(X, y, logistic = FALSE, tol=NULL, rank=FALSE, nboot = 0, conf=0.95, max.iter=1000, parl=NULL){
###########################################
#Pre-processing
#Retrieving dimension
d<-ncol(X)
#Checking for categorical inputs
any.cat<-any(sapply(X,class)=="factor")
##########################################
#Arguments check
#X
if(!(class(X)[1]=="matrix" | class(X)[1]=="data.frame")){
stop("X must be either a matrix of a data frame.")
}
#y
if (logistic==F & !is.numeric(y)){
stop(paste("y must be a numeric vector of length ",nrow(X)," or a matrix with ", nrow(X), " rows and one column.", sep=""))
}else if(logistic==T & (all(!is.logical(y), !is.factor(y), !is.numeric(y))) ){
stop("y must be a logical, factor or numeric vector.")
}else if (length(y)!=nrow(X)){
stop(paste("y must be a vector of length ",nrow(X)," or a matrix with ", nrow(X), " rows and one column.", sep=""))
}
#logistic
if(!is.logical(logistic)){
stop("The 'logistic' argument must be logical, either TRUE or FALSE.")
}
#rank
if(!is.logical(rank)){
stop("The 'rank' argument must be logical, either TRUE or FALSE.")
}else if (logistic & rank){
rank=F
warning("Impossible to perform a logistic regression with a rank transformation. Defaulted to rank=FALSE.")
}else if (any.cat & rank){
rank=F
warning("Impossible to perform a rank transformation with categorical inputs. Defaulted to rank=FALSE.")
}
#nboot
if(!is.numeric(nboot)){
stop("The 'nboot' argument must be a positive integer.")
}else if (nboot%%1!=0 | nboot<0 | length(nboot)!=1){
stop("The 'nboot' argument must be a positive integer.")
}else if(nboot==0){
boot=F
}else{
boot=T
}
#parl
if(!is.numeric(parl)){
if(!is.null(parl)){
stop("The 'parl' argument must be NULL or a positive integer.")
}
}else if(parl%%1!=0 | parl<0 | length(parl)!=1){
stop("The 'parl' argument must be NULL or a positive integer.")
}else if (parl>parallel::detectCores()){
parl=parallel::detectCores()-1
warning(paste("Too many cores specified. Defaulted to ", parl, " cores.", sep=""))
}
########################################
#Data Processing
#Rank Transformation
if(rank){
X<-apply(X, 2, rank)
}
#List of subsets by order ([[1]]: subsets of order 1...etc)
indices<-rep(list(0), d+1)
#List of R2s, arranged by orders (same structure as the indices object)
R2s<-rep(list(0), d+1)
#List of elements for recursive computing
#Computing subsets
for(j in 1:d){
indices[[j+1]]<-t(gtools::combinations(n=d, r=j))
}
#####################################################
# PMVD Computing
#Preparing parallel cluster
if(!is.null(parl)){
cl=parallel::makeCluster(parl)
}
if(!boot){
#################################
# Non-Bootstrap computation
############################################
#Conditional elements computing
#For every order, the corresponding R2 estimate of each
#subset of indices are stored in R2s
for(j in 1:d){
if(!is.null(parl)){
#############################
#Parallelized R2 estimation
R2s[[j+1]]<-parallel::parApply(cl,indices[[j+1]], 2,estim.R2,
dataX=X,
y=y,
logistic=logistic,
any.cat=any.cat,
max.iter=max.iter)
}else{
#############################
#R2 estimation
R2s[[j+1]]<-apply(indices[[j+1]], 2,estim.R2,
dataX=X,
y=y,
logistic=logistic,
any.cat=any.cat,
max.iter=max.iter)
}
}
#################################
# PMVD indices computation
#Preparing the recursive computation of the PMVD indices
#Full model indices
N=1:d
#Initiating P(S)
Ps<-rep(list(0), d+1)
#Computing w({i}) forall i
Ps[[1]]=rev(R2s[[d+1]] - R2s[[d]])
#Finding if any w({i})=0 or |w({i})|<tol
if(is.null(tol)){
logi.zeros<-Ps[[1]]==0
}else{
logi.zeros<-abs(Ps[[1]])<tol
}
if(any(logi.zeros)){
var_null<-which(logi.zeros)
nb_null<-length(var_null)
idx_ind_null=rep(list(0), d-nb_null+1)
indices_<-rep(list(0), d-nb_null)
for(i in 1:(d-nb_null)){
checkmat<-matrix(is.element(indices[[i+1]], var_null), i, ncol(indices[[i+1]]))
idx_ind_null[[i]] <- as.vector(which(colSums(checkmat) == 0))
indices_[[i]]<-matrix(indices[[i+1]][, idx_ind_null[[i]]],ncol=length(idx_ind_null[[i]]))
}
p=d #Original number of dimensions
d=d-nb_null #Number of dimensions after removing spurious covariates
Ps[[1]]<-Ps[[1]][setdiff(N, var_null)] #Only keeping w({i}) for non-spurious covariates
}else{
p=d
nb_null=0
}
#Value of R2(full_model) to avoid a lot of search
R2_comp<-R2s[[p+1]]
if(!is.null(parl)){doParallel::registerDoParallel(cl)}
for(ord in 2:d){
#For every input orders : we start at 2 because Ps[[1]] has already been computed
if(ord==d){
#If this is the last order
Ps[[ord]]=R2_comp*(1/sum(1/Ps[[ord-1]]))
}else{
if(any(logi.zeros)){
#########################
#If any spurious variables
#Computing w(S) for all S of order ord
Ws<-rev(R2_comp - R2s[[d+1-ord]][idx_ind_null[[d-ord]]])
nbset<-ncol(indices_[[ord]]) #Number of subsets S
if(!is.null(parl)){
Ps[[ord]]<-foreach::foreach(i=1:nbset, .combine=cbind)%dopar%{
S<-c(indices_[[ord]][,i])
idx_Spi=which(colSums(matrix(indices_[[ord-1]]%in%S, nrow=length(S)-1, ncol=ncol(indices_[[ord-1]])))==length(S)-1)
res=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi]))
res
}
}else{
Ps[[ord]]<-rep(0, nbset) #Setting up the Ps results
for(i in 1:nbset){
S<-c(indices_[[ord]][,i]) #For every possible subset S of order ord
idx_Spi=which(colSums(matrix(indices_[[ord-1]]%in%S, nrow=length(S)-1, ncol=ncol(indices_[[ord-1]])))==length(S)-1) #Find every S\{i} for all i
Ps[[ord]][i]=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi])) #Recursively compute Ps
}
}
}else{
########################
#If no spurious variables
#Computing w(S) for all S of order ord
Ws<-rev(R2_comp - R2s[[d+1-ord]])
nbset<-ncol(indices[[ord+1]])#Number of subsets S
if(!is.null(parl)){
Ps[[ord]]<-foreach::foreach(i=1:nbset, .combine=cbind)%dopar%{
S<-c(indices[[ord+1]][,i])
idx_Spi=which(colSums(matrix(indices[[ord]]%in%S, nrow=length(S)-1, ncol=ncol(indices[[ord]])))==length(S)-1)
res=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi]))
res
}
}else{
Ps[[ord]]<-rep(0, nbset)#Setting up the Ps results
for(i in 1:nbset){
S<-c(indices[[ord+1]][,i]) #For every possible subset S of order ord
idx_Spi=which(colSums(matrix(indices[[ord]]%in%S, nrow=length(S)-1, ncol=ncol(indices[[ord]])))==length(S)-1)#Find every S\{i} for all i
Ps[[ord]][i]=Ws[i]/sum(1/(Ps[[ord-1]][idx_Spi]))#Recursively compute Ps
}
}
}
}
}
pmvd<-matrix(rev(Ps[[d]]/Ps[[d-1]]), ncol=1)
if(any(logi.zeros)){
#If spurious variable, we set their value to 0
index <- 1
pmvd_ <- rep(0, p)
for (i in 1:p) {
if (!is.element(i, var_null)) {
pmvd_[i] <- pmvd[index]
index <- index + 1
}
}
pmvd=matrix(pmvd_, ncol=1)
}
rownames(pmvd)=colnames(X)
colnames(pmvd)=c("original")
#Preparing the output.
out<-list("call"=match.call(),
"pmvd"=pmvd,
"R2s"=R2s,
"indices"=indices,
"P"=Ps,
"conf_int"=NULL,
"X"=X,
"y"=y,
"logistic"=logistic,
"boot"=boot,
"nboot"=nboot,
"rank"=rank,
"parl"=parl,
"conf"=conf)
}else{
##################################
# Bootstrap computation
############################################
#Conditional elements computing
#For every order, the corresponding R2 estimate of each
#subset of indices are stored in R2s
for(j in 1:d){
if(!is.null(parl)){
#############################
#Parallelized R2 estimation
R2s[[j+1]]<-parallel::parApply(cl,indices[[j+1]], 2,estim.R2,
dataX=X,
y=y,
logistic=logistic,
any.cat=any.cat,
max.iter=max.iter)
}else{
#############################
#R2 estimation
R2s[[j+1]]<-apply(indices[[j+1]], 2,estim.R2,
dataX=X,
y=y,
logistic=logistic,
any.cat=any.cat,
max.iter=max.iter)
}
}
if(is.null(parl)){
#Bootstrap confidence interval estimation
boot_pmvd<-boot::boot(data=cbind(X),
statistic=calc.pmvd.rec.boot,
R=nboot,
# d=d,
y=y,
logistic=logistic,
indices=indices,
any.cat=any.cat,
tol=tol,
max.iter=max.iter,
stype="i")
}else{
#Parallelized Bootstrap confidence interval estimation
boot_pmvd<-boot::boot(data=cbind(X),
statistic=calc.pmvd.rec.boot,
R=nboot,
# d=d,
y=y,
logistic=logistic,
indices=indices,
any.cat=any.cat,
stype="i",
parallel="snow",
ncpus=parl,
tol=tol,
max.iter=max.iter,
cl=cl)
}
res.boot<-bootstats(boot_pmvd, conf, "basic")
rownames(res.boot)<-colnames(X)
pmvd<-matrix(res.boot[,1], ncol=1)
colnames(pmvd)=c("original")
rownames(pmvd)=colnames(X)
out<-list("call"=match.call(),
"pmvd"=pmvd,
"R2s"=R2s,
"indices"=indices,
"P"=NULL,
"conf_int"=res.boot,
"X"=X,
"y"=y,
"logistic"=logistic,
"boot"=boot,
"nboot"=nboot,
"rank"=rank,
"parl"=parl,
"conf"=conf)
}
#Stopping the cluster after parallelized computation
if(!is.null(parl)){parallel::stopCluster(cl)}
class(out)<-"pmvd"
return(out)
}
#######################################
# Custom methods
print.pmvd<-function(x, ...){
cat("\nCall:\n", deparse(x$call), "\n", sep = "")
if(x$logistic){
cat("\nPMVD decomposition of R2 for logistic model\n")
}else{
cat("\nPMVD decomposition of R2 for linear model\n")
}
if(x$boot){
print(x$conf_int)
}else{
print(x$pmvd)
}
}
plot.pmvd<-function(x, ylim=c(0,1), ...){
if(x$logistic){
plot_title="PMVD decomposition of R2 for logistic model"
}else{
plot_title="PMVD decomposition of R2 for linear model"
}
plot(x$pmvd,
ylim=ylim,
axes=F,
xlab="Inputs",
ylab="PMVD",
main=plot_title,
...)
axis(2)
axis(1, at=seq_along(x$pmvd), labels=colnames(x$X))
box()
graphics::grid()
if(x$boot){
segments(x0=1:ncol(x$X),
y0=x$conf_int$`min. c.i.`,
x1=1:ncol(x$X),
y1=x$conf_int$`max. c.i.`)
legend("topright",
pch=c(1,NA),
lty=c(NA,1),
legend=c("PMVD values",
paste(x$conf*100, "% Confidence interval", sep="")),
bg="white")
}else{
legend("topright",
pch=1,
legend=c("PMVD Values"),
bg="white")
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.