# R/regRSM.r In regRSM: Random Subspace Method (RSM) for Linear Regression

```compute_initial_weights = function(y,x){
# The function returns initial weights.

initial_weights = as.numeric(cor(y,x))^2
initial_weights = initial_weights/(sum(initial_weights))
return(initial_weights)
}

compute_scores = function(y,x,m,B,initial_weights=NULL){
# The function returns RSM scores.

p = ncol(x)
scores = numeric(p)
ns = numeric(p)

for(k in 1:B){
submodel = sample(1:p,size=m,replace=FALSE,prob=initial_weights)
lm1 = lm(y~x[,submodel])
weights = as.numeric((summary(lm1)\$coef[-1,3])^2)
submodel = submodel[which(!is.na(coef(lm1)[-1]))] #UPDATE: deal with collinearity
scores[submodel] =  scores[submodel] + weights
ns[submodel] = ns[submodel] + 1
}
ns = ifelse(ns!=0,ns,1)
scores = scores/ns

return(scores)
}

compute_scores_parallel = function(y,x,m,B,initial_weights,nslaves){
#This function returns RSM scores computed using parallel function foreach %dopar%.
p = ncol(x)
requireNamespace("foreach",quietly = TRUE)
r1 = foreach(i = 1:(nslaves-1), .combine=cbind, .export=c("compute_scores_partial")) %dopar%{

scores_ns
}

r3 = cbind(r1,r2)
scores = apply(r3[1:p,],1,sum)
ns = apply(r3[-(1:p),],1,sum)
ns = ifelse(ns!=0,ns,1)
scores = scores/ns

return(scores)
}

# The function returns RSM scores computed based on taskCount repetitions. Unlike the compute_scores this function returns concatenated vector of scores and ns.

p = ncol(x)
scores = numeric(p)
ns = numeric(p)

submodel = sample(1:p,size=m,replace=FALSE,prob=initial_weights)
lm1 = lm(y~x[,submodel])
weights = as.numeric((summary(lm1)\$coef[-1,3])^2)
submodel = submodel[which(!is.na(coef(lm1)[-1]))] #UPDATE: deal with collinearity
scores[submodel] =  scores[submodel] + weights
ns[submodel] = ns[submodel] + 1
}

return(c(scores,ns))
}

is.wholenumber = function(x, tol = .Machine\$double.eps^0.5){
# The function checks if the given number is whole number.

abs(x - round(x)) < tol
}

select_finalmodel_bic = function(y,x,order1,thrs,penalty){
# The function returns final model using Bayesian Information Criterion.

n = length(y)
lm0 = lm(y~1)
beta0 = as.numeric(coef(lm0))

xo = x[,order1[1:thrs]]
xo = cbind(1,xo)
xo = as.matrix(xo)

#     qr1 = qr(xo)
#     Q = qr.Q(qr1)
#     R = qr.R(qr1)
#     R_inv = solve(R)
#     tQy = t(Q) %*% y

qr1 = qr(xo)
Q = qr.Q(qr1)
R = qr.R(qr1)
small_diag = which(abs(diag(R))<1e-7)
if(length(small_diag)>0){
#Deal with collinearities, coefficients corresponding to linearly dependent columns will be 0.
del = qr1\$pivot[small_diag]
Q1 = Q[,-small_diag]
R1= R[-small_diag,-small_diag]
R1_inv = solve(R1)
tQy1 = t(Q1) %*% y
R_inv = matrix(0,ncol(xo),ncol(xo))
tQy = numeric(ncol(xo))
R_inv[-del,-del]=R1_inv
tQy[-del] = tQy1
warning("Multicollinearity in data. Only coeffients corresponding to linearly independent columns are estimated.")
}else{
R_inv = solve(R)
tQy = t(Q) %*% y
}

betabj = vector("list",thrs)
bic = numeric(thrs)

RSthrs = y-Q %*% tQy
for(j in thrs:2){
}

sel = which.min(bic)
betab = as.numeric(R_inv[1:(sel+1),1:(sel+1)] %*% tQy[1:(sel+1)])
model_sel = order1[1:sel]

if(bic0<bic[sel]){
betab = beta0
model_sel = 0
}

Result = list(model=model_sel,informationCriterion=bic,coefficients=betab)
return(Result)
}

select_finalmodel_qr = function(y,x,yval,xval,order1){
# The function returns final model corresponding to minimal prediction error on validation set.

n = length(y)
p = ncol(x)
x = x[,order1]
xval = xval[,order1]

p1 = ifelse(p<=(n-1),p,n-1)

x = x[,1:p1]
xval = xval[,1:p1]

x=cbind(1,x)
xval=cbind(1,xval)

#     qr1 = qr(x)
#     Q = qr.Q(qr1)
#     R = qr.R(qr1)
#     R_inv = solve(R)
#     tQy = t(Q) %*% y

qr1 = qr(x)
Q = qr.Q(qr1)
R = qr.R(qr1)
small_diag = which(abs(diag(R))<1e-7)
if(length(small_diag)>0){
#Deal with collinearities, coefficients corresponding to linearly dependent columns will be 0.
del = qr1\$pivot[small_diag]
Q1 = Q[,-small_diag]
R1= R[-small_diag,-small_diag]
R1_inv = solve(R1)
tQy1 = t(Q1) %*% y
R_inv = matrix(0,ncol(x),ncol(x))
tQy = numeric(ncol(x))
R_inv[-del,-del]=R1_inv
tQy[-del] = tQy1
warning("Multicollinearity in data. Only coefficients corresponding to linearly independent columns are estimated.")
}else{
R_inv = solve(R)
tQy = t(Q) %*% y
}

betab = vector("list",p1)
betab[[p1]] = R_inv %*% tQy

pred_err = numeric(p1)
xval_R_inv = xval %*% R_inv
pred = xval_R_inv %*% tQy
pred_err[p1] = mean( (yval-pred)^2 )

for(j in p1:2){
betab[[j-1]] = betab[[j]] - as.numeric(tQy[j+1] * R_inv[,(j+1)])
pred = pred - tQy[j+1] * xval_R_inv[,(j+1)]
pred_err[j-1] = mean( (yval - pred)^2 )
}

jsel=which.min(pred_err)
model=order1[1:jsel]

Result = list(model=model,predError=pred_err,coefficients=betab[[jsel]][1:(jsel+1)])
return(Result)
}

new.regRSM <- function()
{

regRSM=list(scores=NULL,model=NULL,time=list(user=0,system=0,elapsed=0),
data_transfer=list(user=0,system=0,elapsed=0),
coefficients=NULL, predError=NULL,input_data=list(x=NULL,y=NULL),
control=list(useGIC=NULL,selval=NULL,screening=NULL,init_weights=FALSE,parallel=NULL,m=NULL,B=NULL))

attr(regRSM,"class")="regRSM"
return(regRSM)
}

#regRSM = function(x,y,yval,xval,m,B, parallel,nslaves,store_data,screening,init_weights,useGIC,thrs,penalty) UseMethod("regRSM")
regRSM = function(x,...) UseMethod("regRSM")

regRSM.default = function(x,y,yval=NULL,xval=NULL,m=NULL,B=NULL, parallel="NO",nslaves=c(4),
store_data=FALSE,screening=NULL,init_weights=FALSE,useGIC=TRUE,thrs=NULL,penalty=NULL,...)
{
data_x = x;
x = as.matrix(x)
y = as.numeric(y)
n = length(y)
p = ncol(x)
scores = NULL

startTime <- proc.time()

#Check for missing values:
complete_cases1 = complete.cases(x)
which_row_na = which(!complete_cases1)
if(length(which_row_na)!=0){
stop("Missing values in rows: ",toString(which_row_na))
}

# Set default values of m and B
if(is.null(m)){
m = floor(min(n-1,p)/2)
}else{
if(m>(n-2)) stop("Parameter m cannot be larger than the number of observations minus two!")
if(m<=0) stop("Parameter m must be a positive number!")
if(!is.wholenumber(m)) stop("Parameter m must be a whole number!")
}
if(is.null(B)){
B = 1000
}else{
if(B<=0) stop("Parameter B must be a positive number!")
if(!is.wholenumber(B)) stop("Parameter B must be a whole number!")
}

#Check for screeneing
if(!is.null(screening))
{
if((screening>=1)||(screening<=0)) stop("screening must be in (0,1)")

iw =  compute_initial_weights(y,x)
sel = which(iw>=quantile(iw,screening))
if(m>length(sel)) stop('Parameter m cannot be larger than the number of attributes remaining after screening procedure!')
x = x[,sel]
}
#Check for initial_weights
if(init_weights){
initial_weights = compute_initial_weights(y,x)
}else{
initial_weights = NULL
}

#RSM method esence
if(parallel=="MPI"){
if("Rmpi" %in% rownames(installed.packages()) == FALSE){
stop("Please install package 'Rmpi' to use MPI parallel version of regRSM")
}
requireNamespace("Rmpi",quietly = TRUE)
create_slaves(nslaves)
d1=proc.time()
send_data(y,x,m,initial_weights)
d2=proc.time()
p = ncol(x)
n = nrow(x)
scores=make_experimentStatic(n,p,B,m,initial_weights)
}else{
if(parallel=="POSIX"){
if("doParallel" %in% rownames(installed.packages()) == FALSE){
stop("Please install package 'doParallel' to use POSIX parallel version of regRSM")
}
requireNamespace("doParallel",quietly = TRUE)
requireNamespace("parallel",quietly = TRUE)
d1=proc.time()
cl = makeCluster(nslaves)
registerDoParallel(cl)
#registerDoParallel(nslaves)
d2=proc.time()
scores = compute_scores_parallel(y,x,m,B,initial_weights,nslaves)
stopCluster(cl)
#stopImplicitCluster()
}else{
d1=d2=proc.time()
scores = compute_scores(y,x,m,B,initial_weights)
}
}

#Set score 0, when variable is not selected by screeneing
if(!is.null(screening)){
scores1 = numeric(ncol(data_x))
scores1[sel] = scores
scores = scores1
}

selval = ifelse(!is.null(yval) && !is.null(xval),TRUE,FALSE)
if(selval) useGIC = FALSE

if(useGIC){
if(is.null(penalty)){
penalty = log(length(y))
}else{
if(penalty<0) stop("Penalty must be positive!")
}
if(is.null(thrs)){
thrs = ifelse(p<=floor(n/2),p,floor(n/2))
}else{
if(thrs>min(p,(n-2))) stop("Parameter thrs cannot be larger than min(p,(n-2))!")
if(thrs<=1) stop("Parameter thrs must be greater than one!")
if(!is.wholenumber(thrs)) stop("Parameter thrs must be a whole number!")
}
order1 = sort(scores,decreasing=TRUE,index.return=TRUE)\$ix
selected_model = select_finalmodel_bic(y,data_x,order1,thrs,penalty)
model = selected_model\$model
coefficients =  as.numeric(selected_model\$coefficients)
predError = NULL
informationCriterion = selected_model\$informationCriterion
}else{
if(selval==TRUE){
order1 = sort(scores,decreasing=TRUE,index.return=TRUE)\$ix
selected_model = select_finalmodel_qr(y,data_x,yval,xval,order1)
model = selected_model\$model
coefficients =  as.numeric(selected_model\$coefficients)
predError = selected_model\$predError
informationCriterion = NULL
}else{
model = NULL
coefficients =  NULL
predError = NULL
informationCriterion = NULL
}
}
stopTime <- proc.time()

regRSM = new.regRSM()
regRSM\$scores = scores
regRSM\$model = model
regRSM\$time = stopTime-startTime
regRSM\$coefficients = coefficients
regRSM\$predError = predError
regRSM\$informationCriterion = informationCriterion
regRSM\$data_transfer = d2-d1
if(store_data) { regRSM\$input_data\$x=data_x; regRSM\$input_data\$y=y }

regRSM\$control\$useGIC = useGIC
regRSM\$control\$selval = selval
regRSM\$control\$screening = screening
regRSM\$control\$init_weights =  init_weights
regRSM\$control\$parallel = parallel
regRSM\$control\$m = m
regRSM\$control\$B = B

print(regRSM)
return(regRSM)
}

regRSM.formula <- function(formula, data=NULL, ...)
{
mf = model.frame(formula,data)
x = model.matrix(attr(mf, "terms"), data=mf)
#Remove column corresponding to the intercept
x = as.matrix(x[,-1])
y = as.numeric(model.response(mf))
est = regRSM(x, y, ...)
cl = match.call()
est\$call = cl
est\$formula = formula
return(est)
}

predict.regRSM = function(object,xnew,...){
if(is.null(object\$coefficients)) stop("The final model is not selected!")
ynew <- as.numeric(cbind(1,xnew[,object\$model]) %*% coef(object))
return(ynew)
}

plot.regRSM = function(x,...){

object = x
if(!is.null(object\$informationCriterion)){
p1 = length(object\$informationCriterion)
plot(1:p1,object\$informationCriterion,type="b",xlab="Variables",ylab="Generalized Information Criterion",main="Generalized Information Criterion",...)
}else{
if(!is.null(object\$predError)){
p1 = length(object\$predError)
plot(1:p1,object\$predError,type="b",xlab="Variables",ylab="Prediction Error",main="Prediction Error on Validation Set",...)
}else{
cat("The final model is not selected! \n")
}
}
}

ImpPlot = function(object) UseMethod("ImpPlot")

ImpPlot.default <- function(object)
stop("No method implemented for this class of object")

ImpPlot.regRSM = function(object){
#The function produces dot plot showing variable importances of the variables.

if (!inherits(object, "regRSM"))
stop("This function only works for objects of class `regRSM'")

dotchart(object\$scores,main="Variable Importance measure",xlab="RSM scores",ylab="Variable number")
}

validate = function(object,yval,xval) UseMethod("validate")

validate.default <- function(object)
stop("No method implemented for this class of object")

validate.regRSM = function(object,yval,xval){
#The function selects the new final model based on the final scores form the original fit.

if(is.null(object\$input_data\$x) || is.null(object\$input_data\$y)) stop('Argument store_data must be TRUE!')

order1 = sort(object\$scores,decreasing=TRUE,index.return=TRUE)\$ix

selected_model = select_finalmodel_qr(object\$input_data\$y,object\$input_data\$x,yval,xval,order1)

model = selected_model\$model
predError = selected_model\$predError
coefficients =  as.numeric(selected_model\$coefficients)

object\$model=model
object\$coefficients=coefficients
object\$predError=predError

return(object)
}

print.regRSM = function(x,...){
#The function prints information about the model.

object = x

cat("\n Model summary: \n\n")

if(object\$control\$useGIC){
cat("Selection method: Generalized Information Criterion \n")
}else{
if(object\$control\$selval==TRUE){
cat("Selection method: Validation set \n")
}else{
cat("Selection method: Final model is not selected! \n")
}
}
if(!is.null(object\$control\$screening)){
cat("Screening: yes \n")
}else{
cat("Screening: no \n")
}

if(object\$control\$init_weights){
cat("Initial weights: yes \n")
}else{
cat("Initial weights: no \n")
}

if(object\$control\$parallel=="MPI" | object\$control\$parallel=="POSIX"){
cat("Version: parallel \n")
}else{
cat("Version: sequential \n")
}

cat("Subspace size:", object\$control\$m, "\n")
cat("Number of simulations:", object\$control\$B, "\n")

if(object\$control\$parallel=="MPI" ){
cat("Remember to use: 'mpi.close.Rslaves()' before terminate the current R session. \n")
}

}

summary.regRSM = function(object,...){
#The function prints information about the model.

cat("\n Model summary: \n\n")

if(object\$control\$useGIC){
cat("Selection method: Generalized Information Criterion \n")
}else{
if(object\$control\$selval==TRUE){
cat("Selection method: Validation set \n")
}else{
cat("Selection method: Final model is not selected! \n")
}
}
if(!is.null(object\$control\$screening)){
cat("Screening: yes \n")
}else{
cat("Screening: no \n")
}

if(object\$control\$init_weights){
cat("Initial weights: yes \n")
}else{
cat("Initial weights: no \n")
}

if(object\$control\$parallel=="MPI" | object\$control\$parallel=="POSIX"){
cat("Version: parallel \n")
}else{
cat("Version: sequential \n")
}

cat("Subspace size:", object\$control\$m, "\n")
cat("Number of simulations:", object\$control\$B, "\n")

if(object\$control\$parallel=="MPI" ){
cat("Remember to use: 'mpi.close.Rslaves()' before terminate the current R session. \n")
}

}

roc = function(object,truemodel,plotit,...) UseMethod("roc")

roc.default <- function(object)
stop("No method implemented for this class of object")

roc.regRSM = function(object,truemodel,plotit=TRUE,...){
# The function produces ROC curve for ordering and computes AUC.

ordering =  sort(object\$scores,decreasing=TRUE,index.return=TRUE)\$ix

if(length(setdiff(truemodel,ordering))>=1){
stop("Argument truemodel must be a subset of ordering!")
}

p = length(ordering)
npos = length(truemodel)
nneg = p - npos

tpr = numeric(p+1)
fpr = numeric(p+1)

tpr[1] = 0
fpr[1] = 0

for(j in 2:(p+1)){
if(ordering[j-1] %in% truemodel){
tpr[j] = tpr[j-1] + 1
fpr[j] = fpr[j-1]
}else{
tpr[j] = tpr[j-1]
fpr[j] = fpr[j-1] + 1
}

}
tpr = tpr/npos
fpr = fpr/nneg

counts = as.numeric(summary(as.factor(tpr[tpr!=0])))
xs = (counts-1)*(1/nneg)
values = unique(tpr[tpr!=0])
auc = sum( xs * values )

if(plotit){
plot(fpr,tpr,type="b",main="ROC curve for RSM ordering")
legend("bottomright",paste("AUC= ",round(auc,4)))
}else{
return(auc)
}

}
```

## Try the regRSM package in your browser

Any scripts or data that you put into this service are public.

regRSM documentation built on May 2, 2019, 7:01 a.m.