## ILAA is a wrapper of IDeA that computes a linear transformation matrix and returns the transformed data set.
# The transformation matrix is stored as an attribute of the returned data frame.
# The main motivation behind ILAA is to simplify the IDeA call and to return a transformation that is robust to small data perturbations.
# ### ILAA Function Interface
# The `ILAA` function processes and analyzes a dataset (`data`) using the specified correlation threshold and method. It can use bootstrapping to enhance the robustness of the estimated transformation, and it produces various metrics and adjustments based on the input parameters. Below is a detailed description of each parameter and of the function's overall operation:
# #### Parameters:
# - data: `data.frame` (default: `NULL`)
# - The input dataset to be analyzed. It should be a data frame containing the variables of interest.
# - thr: `numeric` (default: `0.80`)
# - The correlation threshold: feature pairs whose absolute correlation exceeds this value are considered collinear and will be decorrelated.
# - method: `character` (default: `c("pearson", "spearman")`)
# - The correlation method used in the analysis: "pearson" for Pearson correlation or "spearman" for Spearman rank correlation.
# - Outcome: `character` or `NULL` (default: `NULL`)
# - The name of the outcome variable in the dataset. If provided, it is excluded from the transformation and can be used to guide the selection of driving features.
# - drivingFeatures: `character` vector or `NULL` (default: `NULL`)
# - Features intended to act as the unaltered basis of the transformation. If provided, they are given priority when deciding which feature of a correlated pair remains unaltered.
# - maxLoops: `integer` (default: `100`)
# - The maximum number of decorrelation loops to be performed in the analysis process.
# - verbose: `logical` (default: `FALSE`)
# - If `TRUE`, the function prints detailed output during execution, which is useful for debugging or for following the analysis steps.
# - bootstrap: `integer` (default: `0`)
# - The number of bootstrap samples. If greater than 1, the transformation is estimated on `bootstrap` resamples of the data and the results are aggregated into a single, more robust transformation matrix.
# #### Operation:
# 1. Method Selection:
# - The function selects the correlation method from the `method` argument. For Pearson it uses IDeA's fast linear-fitting path (`method = "fast"`, `type = "LM"`); for Spearman it uses robust linear models (`type = "RLM"`).
# 2. Initial IDeA Call:
# - The `IDeA` function is called with the specified parameters to perform the initial analysis and transformation of the data.
# 3. Bootstrapping (if applicable):
# - If `bootstrap` is greater than 1, the function initializes matrices and vectors to store results from multiple bootstrap samples.
# - It performs bootstrapping by resampling the data and calling the `IDeA` function repeatedly, adjusting and accumulating the results across samples.
# 4. Result Aggregation:
# - The function aggregates the results from the bootstrapping process, normalizing the accumulated transformation matrices and scores.
# - It identifies latent variables and adjusts the transformation matrices accordingly.
# 5. Final Adjustments:
# - The final transformation matrix and other attributes are updated based on the aggregated results.
# - The aggregated transformation is applied to the input data (via `predictDecorrelate`), and variance ratios between the transformed and the original features are computed.
# 6. Return:
# - The function returns the final transformation results (`transf`) with updated attributes, including transformation matrices, scores, latent variables, and variance ratios.
# ---
ILAA <- function(data=NULL,
thr=0.80,
method=c("pearson","spearman"),
Outcome=NULL,
drivingFeatures=NULL,
maxLoops=100,
verbose=FALSE,
bootstrap=0)
{
# Select method (either "pearson" or "spearman") and set type accordingly
method <- match.arg(method);
type="LM";
if (method=="pearson")
{
method <- "fast";
}
if (method=="spearman")
{
type="RLM";
}
if (verbose)
{
cat(method,"|",type,"|")
}
## First IDeA Call
transf <- IDeA(data=data,
thr=thr,
method=method,
Outcome=Outcome,
drivingFeatures=drivingFeatures,
maxLoops=maxLoops,
verbose=verbose,
type=type)
if (bootstrap > 1)
{
# Initialize various matrices and vectors for bootstrap
transform <- attr(transf,"UPLTM")
ocalnames <- colnames(transform)
fscore <- attr(transf,"fscore");
countf <- attr(transf,"TotalAdjustments");
AdrivingFeatures <- attr(transf,"drivingFeatures");
bfeat<- attr(transf,"unaltered");
adjunipvalue <- attr(transf,"unipvalue");
rcrit <- attr(transf,"R.critical");
IDeAEvolution <- attr(transf,"IDeAEvolution");
useDeCorr <- attr(transf,"useDeCorr")
dsize <- nrow(data);
lavariables <- names(fscore);
lavariables <- str_remove_all(lavariables,"La_");
names(fscore) <- lavariables
taccmatrix <- diag(length(fscore));
colnames(taccmatrix) <- lavariables
rownames(taccmatrix) <- lavariables
twts <- rep(1,nrow(taccmatrix));
names(twts) <- lavariables
colnames(transform) <- str_remove_all(ocalnames,"La_");
taccmatrix[rownames(transform),colnames(transform)] <- transform
sgnmatrix <- 1*(taccmatrix > 0) - 1*(taccmatrix < 0)
tmptransform <- sgnmatrix
if (verbose)
{
cat("bootstrapping \n")
}
for (lp in c(1:bootstrap))
{
if (verbose)
{
cat(".")
if (lp %% 40 == 0) cat("\n")
}
if (bootstrap > 500) ## True bootstrap
{
smp <- sample(dsize,dsize,replace = TRUE);
}
else ## small bootstrap: 5% of the samples will be duplicated
{
smp <- c(sample(dsize,0.95*dsize),sample(dsize,0.05*dsize));
}
## Calling IDeA inside the Loop
transf <- IDeA(data[smp,],
thr=thr,
method=method,
Outcome=Outcome,
drivingFeatures=AdrivingFeatures,
maxLoops=maxLoops,
type=type)
## Now aggregate this bootstrap IDeA transformation into the ILAA estimate
transform <- attr(transf,"UPLTM")
bcolnames <- colnames(transform)
colnames(transform) <- str_remove_all(bcolnames,"La_")
rnames <- rownames(transform)
cnames <- rnames
# wo is the base weight for the weighted aggregation
wo <- 1.0;
tmptransform <- sgnmatrix
tmptransform[rnames,cnames] <- 1*(transform > 0) - 1*(transform < 0)
sgnmatrix[sgnmatrix == 0] <- tmptransform[sgnmatrix == 0]
sgchange <- apply(1*(sgnmatrix*tmptransform < 0),2,sum)
if ((sum(!(bcolnames %in% ocalnames)) > 0) | (sum(sgchange) > 0))
{
if (verbose)
{
cat("{",length(bcolnames[!(bcolnames %in% ocalnames)]),",",sum(sgchange),"}")
}
wo <- 0.1;
cnames <- bcolnames[bcolnames %in% ocalnames];
cnames <- str_remove_all(cnames,"La_")
cnames <- cnames[cnames %in% names(sgchange)[sgchange == 0]]
}
## Estimate the ad hoc weight of this bootstrap sample for the aggregation
mcor <- min(attr(transf,"IDeAEvolution")$Corr)
cwt <- (mcor - thr)/(1.0 - thr);
if (cwt < 0) cwt <- 0
wt <- wo*(1.0 - cwt)^2
## Weighted aggregation.
taccmatrix[rnames,cnames] <- taccmatrix[rnames,cnames] + wt*transform[rnames,cnames]
bscore <- attr(transf,"fscore")
names(bscore) <- str_remove_all(names(bscore),"La_");
fscore[cnames] <- fscore[cnames] + wt*bscore[cnames];
twts[cnames] <- twts[cnames] + wt
if (verbose)
{
cat(sprintf("(r=%3.2f,w=%3.2f)",mcor,wt));
}
}
## Update the transformation matrix and the set of discovered latent variables
tmptransform <- NULL
sgnmatrix <- NULL
transform <- NULL;
for (cn in colnames(taccmatrix))
{
taccmatrix[,cn] <- taccmatrix[,cn]/twts[cn];
}
fscore <- fscore/twts;
islatent <- apply(1*(taccmatrix != 0),2,sum) > 1;
ctnames <- colnames(taccmatrix)
lavariables <- ctnames[islatent]
ctnames[islatent] <- paste("La_",ctnames[islatent],sep="")
snames <- names(fscore)
snames[snames %in% lavariables] <- paste("La_",snames[snames %in% lavariables],sep="")
names(fscore) <- snames
colnames(taccmatrix) <- ctnames
tokeep <- !((apply(1*(taccmatrix != 0),2,sum) == 1) & (apply(1*(taccmatrix != 0),1,sum) == 1))
taccmatrix <- taccmatrix[tokeep,tokeep]
attr(transf,"UPLTM") <- taccmatrix
attr(transf,"LatentVariables") <- lavariables;
## Transform the input dataframe
transf <- predictDecorrelate(transf,data)
VarTrans <- apply(transf[,colnames(taccmatrix)],2,var);
VarObs <- apply(data[,rownames(taccmatrix)],2,var);
VarRatio <- VarTrans/VarObs
VarRatio <- VarRatio[order(-VarRatio)]
noaltered <- colnames(data)[!(colnames(data) %in% rownames(taccmatrix))]
if (length(noaltered)>0)
{
varone <- rep(1,length(noaltered))
names(varone) <- noaltered
VarRatio <- c(varone,VarRatio)
}
## Setting the attributes of the returned dataframe
attr(transf,"UPLTM") <- taccmatrix
attr(transf,"fscore") <- fscore;
attr(transf,"TotalAdjustments") <- countf;
attr(transf,"drivingFeatures") <- AdrivingFeatures;
attr(transf,"unaltered") <- bfeat;
attr(transf,"LatentVariables") <- lavariables;
attr(transf,"useDeCorr") <- useDeCorr;
attr(transf,"unipvalue") <- adjunipvalue
attr(transf,"R.critical") <- rcrit
attr(transf,"IDeAEvolution") <- IDeAEvolution
attr(transf,"VarRatio") <- VarRatio
if (verbose)
{
cat("\n\r")
print(head(VarRatio))
}
}
return(transf)
}
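## --- Hypothetical usage sketch (illustration only, not part of the package source) ---
## A minimal example of calling ILAA on synthetic collinear data. It assumes the
## FRESA.CAD environment (IDeA, predictDecorrelate, stringr) is available; the data
## and object names below are made up. The block is wrapped in `if (FALSE)` so that
## sourcing this file does not execute it.
if (FALSE)
{
  set.seed(42)
  n <- 200
  x1 <- rnorm(n)
  x2 <- 0.9*x1 + rnorm(n, sd = 0.3)  # strongly correlated with x1
  x3 <- rnorm(n)                     # independent feature
  toyData <- data.frame(x1, x2, x3)
  ## Single-pass transformation with Pearson correlations and a 0.8 threshold
  transf <- ILAA(toyData, thr = 0.8, method = "pearson")
  ## Bootstrap-aggregated transformation (50 resamples)
  transfBoot <- ILAA(toyData, thr = 0.8, method = "pearson", bootstrap = 50)
  ## The transformation matrix and diagnostics travel as attributes of the result
  attr(transf, "UPLTM")            # the linear transformation matrix
  attr(transf, "VarRatio")         # ratio of transformed to observed variance per feature
  attr(transf, "LatentVariables")  # features replaced by latent variables
}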
## IDeA computes the linear transformation that mitigates feature collinearity. The transformation is then applied to the input data set.
IDeA <- function(data=NULL,
thr=0.80,
method=c("fast","pearson","spearman","kendall"),
Outcome=NULL,
refdata=NULL,
drivingFeatures=NULL,
useDeCorr=TRUE,
relaxed=TRUE,
corRank=TRUE,
maxLoops=100,
unipvalue=0.05,
verbose=FALSE,
...)
{
# Ensure 'Rfast' package is installed
if (!requireNamespace("Rfast", quietly = TRUE)) {
install.packages("Rfast", dependencies = TRUE)
}
method <- match.arg(method);
useFastCor <- method=="fast";
if (useFastCor)
{
useDeCorr <- TRUE;
method <- "pearson";
}
minUniqueValues <- 5 # features need at least this many unique values to be treated as continuous (not factor-like)
correlationMeasureEvolution <- numeric()
sparcityEvolution <- numeric();
featVector <- data[,1];
# Function to calculate linear beta coefficients for a given feature: x_i=b*x_j+c
getAllBetaCoefficients <- function(feat,varlist=NULL)
{
allBetaCoef <- numeric(length(varlist));
featVector <- refdata[,feat]
# Fit the feature against another feature and return the beta coefficient
getBetaCoefficient <- function(x)
{
betaCoef <- 0;
modellm <- try(lm(x~featVector,model = FALSE,na.action=na.exclude))
if (!inherits(modellm, "try-error"))
{
f <- summary(modellm)$coefficients
# If the association is not significant, set the coefficient to zero
if (nrow(f)>1)
{
betaCoef <- modellm$coef[2]*(f[2,4] < adjunipvalue);
}
}
return (betaCoef)
}
# Apply the beta-estimation to all features
if (length(varlist) > 1)
{
allBetaCoef <- apply(refdata[,varlist],2,getBetaCoefficient)
}
else
{
allBetaCoef <- getBetaCoefficient(refdata[,varlist])
}
# If error set to zero
allBetaCoef[is.na(allBetaCoef)] <- 0;
allBetaCoef[is.nan(allBetaCoef)] <- 0;
allBetaCoef[is.infinite(allBetaCoef)] <- 0;
names(allBetaCoef) <- varlist
return (allBetaCoef)
}
if (thr >= 1.0) #largest correlation
{
thr <- 0.999
}
# Initializing variables
DeCorrmatrix <- NULL;
topFeatures <- character()
correlatedToBase <- character();
lavariables <- character();
lastintopfeat <- character();
lastdecorrelated <- character();
totused <- character();
totalpha <- character();
bfeat <- NULL
if (is.null(refdata))
{
refdata <- data;
}
dataids <- rownames(data)
refdataids <- rownames(refdata);
dataTransformed <- as.data.frame(rbind(data,refdata[!(refdataids %in% dataids),]));
if (is.null(drivingFeatures))
{
drivingFeatures <- character()
}
AdrivingFeatures <- character()
addedlist <- 1;
lp = 0;
uncorrelatedFetures <- character();
countf <- numeric(ncol(refdata));
names(countf) <- colnames(refdata);
# Variables to analyze for collinearity
varincluded <- colnames(refdata);
# If Outcome is present remove it from the variable list
if (!is.null(Outcome))
{
varincluded <- varincluded[!(varincluded %in% Outcome)];
}
# The character features
ischaracter <- sapply(refdata[,varincluded],class) == "character";
# The factors from the list
isFactor <- sapply(refdata[,varincluded],class) == "factor";
# Variables with enough unique values to be treated as continuous
isContinous <- unlist(lapply(lapply(refdata[,varincluded],unique),length)) >= minUniqueValues;
# The final list of variables that may have collinearity issues
varincluded <- varincluded[!isFactor & isContinous & !ischaracter];
fscore <- numeric(length(varincluded));
names(fscore) <- varincluded;
totFeatures <- length(varincluded);
models <- NULL
# Compute the initial correlation matrix. Rfast correlation is used in large matrices.
if ((useFastCor) && (length(varincluded)>1000))
{
cormat <- abs(Rfast::cora(as.matrix(refdata[,varincluded]),large=TRUE))
colnames(cormat) <- varincluded
rownames(cormat) <- varincluded
}
else
{
cormat <- abs(cor(refdata[,varincluded],method=method))
}
# Set to zero any NA
cormat[is.na(cormat)] <- 0
# Set to zero the diagonal
diag(cormat) <- 0;
# This is debatable, but we use this ad hoc value to estimate the critical correlation threshold
adjunipvalue <- min(c(unipvalue,3*unipvalue/totFeatures)); ## Adjusting for false association at 3 true associations per feature
# adjunipvalue <- unipvalue/totFeatures; ## Adjusting for false association
# Calculate critical correlation threshold
# Any correlation below this threshold is not significant
# If the specified thr is lower than the critical value, thr is set to the critical value
ndf <- nrow(data)-2
tvalue <- qt(1.0 - adjunipvalue,ndf)
rcrit <- tvalue/sqrt(ndf + tvalue^2)
if (thr < rcrit) thr <- rcrit
# Initial values of variables
maxcor <- apply(cormat,2,max)
allFeatAdded <- character()
decordim <- 0
if (length(varincluded) > 1)
{
# If an outcome is specified, let us get the variables associated with the outcome and use them as driving features
if (!is.null(Outcome))
{
if (length(drivingFeatures)==1)
{
if (drivingFeatures[1] == Outcome)
{
if (inherits(data[,Outcome],"factor") | (length(unique(data[,Outcome])) == 2))
{
outcomep <- univariate_correlation(data,Outcome,limit =-1) # the top associated features to the outcome
}
else
{
outcomep <- univariate_correlation(data,Outcome,method=method,limit= -1) # the top associated features to the outcome
}
drivingFeatures <- names(outcomep);
if (verbose) print(head(outcomep))
outcomep <- NULL;
}
}
}
# Only continuous variables
cormat <- cormat[varincluded,varincluded];
# Initial shape of the decorrelation matrix
DeCorrmatrix <- diag(length(varincluded));
colnames(DeCorrmatrix) <- varincluded;
rownames(DeCorrmatrix) <- varincluded;
decordim <- length(varincluded);
maxcor <- apply(cormat,2,max)
# Ranking the variables accordingly to their maximum association strength
ordera <- maxcor;
if (corRank)
{
# If corRank is set, the ranking also incorporates the mean squared correlation with the strongly associated variables
axcor <- cormat;
axcor[axcor < 0.5*(1.0 + rcrit)] <- 0; ## Remove low correlated features
ordera <- 0.90*maxcor + 0.10*apply(axcor^2,2,mean); ## Add mean of top correlated
axcor <- NULL
}
AdrivingFeatures <- varincluded[order(-ordera[varincluded])];
if (length(drivingFeatures) > 0)
{
# The AdrivingFeatures define the order of association exploration, i.e., we want them to form the unaltered basis
AdrivingFeatures <- unique(c(drivingFeatures,AdrivingFeatures))
AdrivingFeatures <- AdrivingFeatures[AdrivingFeatures %in% varincluded];
}
# We use a small trick to force the AdrivingFeatures to be at the top of the list of unaltered basis candidates.
ordera[AdrivingFeatures] <- c(length(AdrivingFeatures):1)
ordera <- ordera/length(AdrivingFeatures);
# Only features whose association is larger than the critical value will be analyzed
bfeat <- as.character(correlated_Remove(cormat,AdrivingFeatures,thr = rcrit,isDataCorMatrix=TRUE))
if (verbose)
{
cat ("\n",head(bfeat),"\n");
print(head(ordera));
cat ("\n Included:",length(varincluded),
", Uni p:",adjunipvalue,
", Base Size:",length(bfeat),
", Rcrit:",rcrit,
"\n")
}
thr2 <- thr # Storing the user specified target
thr <- 1.1*thr
# The algorithm may loop over the following threshold values
thrvalues <- c(0.95,0.90,0.80,0.70,0.60,0.50,0.40,0.30,0.20,0.10,0.05);
nextthr <- 1;
nextthra <- 0;
skv <- 1
# Main loop for feature selection and decorrelation
while (((addedlist > 0) || (thr > thr2)) && (lp < maxLoops))
{
lp = lp + 1;
addedlist <- 0;
# The maximum correlation
maxcor <- apply(cormat,2,max)
mxScor <- max(maxcor); # The current maximum correlation in the current transformed dataset
mxScorm <- mean(c(maxcor[maxcor>=thr2],mxScor));
correlationMeasureEvolution <- c(correlationMeasureEvolution,mxScorm);
sparcityEvolution <- c(sparcityEvolution,sum(DeCorrmatrix == 0));
#Adjusting the loop correlation target
thr <- thr2;
if ((mxScor >= thr2) && relaxed)
{
# Adjust current loop thr if max correlation is still greater than the target thr.
nextthr <- sum(thrvalues > mxScor) + skv
thr <- max(thr2,thrvalues[nextthr]);
}
# Only features that have a correlation greater than "thr" will be decorrelated
topfeat <- varincluded;
names(topfeat) <- topfeat;
topfeat <- topfeat[maxcor[topfeat] >= thr];
if (verbose) cat("\n\r",lp,sprintf("<R=%5.3f,thr=%5.3f>",mxScorm,thr))
maxcomat <- NULL
if (length(topfeat)>0)
{
# Only if significant associations still exist
decorrelatedFetureList <- character();
betamatrix <- diag(length(varincluded));
colnames(betamatrix) <- varincluded;
rownames(betamatrix) <- varincluded;
# Get the minimum subset of features to be decorrelated
topfeat <- topfeat[order(-ordera[topfeat])];
atopbase <- bfeat[bfeat %in% topfeat];
topfeat <- unique(c(atopbase,topfeat));
topfeat <- correlated_Remove(cormat,topfeat,thr = thr,isDataCorMatrix=TRUE)
topfeat <- topfeat[order(-maxcor[topfeat])]; #topfeat has driving features still associated to other features
intopfeat <- character();
toBeDecorrelated <- length(topfeat)
if (verbose) cat(", Top:",toBeDecorrelated);
featAdded <- character();
if (toBeDecorrelated > 0)
{
# Only if there are features with significant correlation
if (toBeDecorrelated > 1)
{
maxcomat <- apply(cormat[,topfeat],1,max);
}
for (feat in topfeat)
{
# Loop over all driving features associated with other features
corlist <- cormat[,feat];
corlist <- corlist[corlist >= thr];
# Get the features that are correlated to the driving features
varlist <- names(corlist)
if ((toBeDecorrelated > 1) && (length(varlist) > 1))
{
varlist <- names(corlist[corlist >= maxcomat[varlist]])
}
varlist <- varlist[!(varlist %in% decorrelatedFetureList)]
if (length(varlist) > 0)
{
# Get the beta coefficients of the driving features to their corresponding associated pair
if (verbose && (feat==topfeat[1])) cat("<",length(varlist),">");
if (useFastCor)
{
#Linear Method
prebetas <- getAllBetaCoefficients(feat,varlist);
varlist <- varlist[prebetas != 0];
if (length(varlist) > 0)
{
betamatrix[feat,varlist] <- -1.0*prebetas[prebetas != 0];
}
}
else
{
#Robust methods
adataframe <- featureAdjustment(varlist,
baseFormula=feat,
data=dataTransformed[,c(feat,varlist)],
referenceframe=refdata[,c(feat,varlist)],
pvalue = adjunipvalue,
...
);
models <- attr(adataframe,"models")
attr(adataframe,"models") <- NULL
if (length(models) > 0)
{
adjusted <- numeric(length(varlist)) == 1;
names(adjusted) <- varlist;
for (vl in 1:length(models))
{
adjusted[models[[vl]]$feature] <- (models[[vl]]$pval < adjunipvalue);
if (adjusted[models[[vl]]$feature])
{
if (is.null(models[[vl]]$model$coef))
{
betamatrix[feat,models[[vl]]$feature] <- 1.0;
useDeCorr <- FALSE;
}
else
{
if (!is.na(models[[vl]]$model$coef[2]))
{
betamatrix[feat,models[[vl]]$feature] <- -1.0*models[[vl]]$model$coef[2];
}
else
{
adjusted[models[[vl]]$feature] <- FALSE;
}
}
}
}
varlist <- varlist[adjusted];
if (length(varlist) > 0)
{
dataTransformed[,c(feat,varlist)] <- adataframe[,c(feat,varlist)];
refdata[,c(feat,varlist)] <- adataframe[refdataids,c(feat,varlist)];
}
}
}
if (length(varlist) > 0)
{
# If any variable was adjusted, we do the in-loop bookkeeping
featAdded <- c(featAdded,feat);
intopfeat <- unique(c(intopfeat,feat));
countf[varlist] <- countf[varlist] + 1;
decorrelatedFetureList <- c(decorrelatedFetureList,varlist);
fscore[feat] <- fscore[feat] + length(varlist);
fscore[varlist] <- fscore[varlist] - 1;
if (toBeDecorrelated > 1)
{
maxcomat[varlist] <- 0;
}
if (verbose && (length(intopfeat) %% 100 == 99)) cat(".")
}
}
}
# Store added features
allFeatAdded <- unique(c(allFeatAdded,featAdded))
ordera[featAdded] <- ordera[featAdded] + 1;
}
addedlist <- length(decorrelatedFetureList);
if (verbose) cat("[Fa=",length(allFeatAdded),"](",length(intopfeat),",",addedlist,",",length(totalpha),"),<")
colused <- character()
if (addedlist > 0)
{
# If features were decorrelated, update the estimate of the transformation matrix
mbetas <- c(intopfeat,decorrelatedFetureList);
totused <- unique(c(totused,mbetas));
totalpha <- unique(c(totalpha,intopfeat));
allused <- varincluded[varincluded %in% totused];
colused <- varincluded[varincluded %in% decorrelatedFetureList];
alphaused <- varincluded[varincluded %in% intopfeat];
totalphaused <- varincluded[varincluded %in% totalpha];
if ((length(colused) == 1) && (length(allused) == 1))
{
# Updating the transformation matrix for a single variable change
DeCorrmatrix[totalphaused,colused] <- as.numeric(DeCorrmatrix[totalphaused,allused]) *
as.numeric(betamatrix[allused,colused]);
}
else
{
# Updating the transformation matrix for more than one change
if (length(totalphaused) > 1)
{
DeCorrmatrix[totalphaused,colused] <- Rfast::mat.mult( as.matrix(DeCorrmatrix[totalphaused,allused]),
as.matrix(betamatrix[allused,colused])
);
}
else
{
mtx1 <- t(as.matrix(DeCorrmatrix[totalphaused,allused]))
mtx2 <- as.matrix(betamatrix[allused,colused])
DeCorrmatrix[totalphaused,colused] <- mtx1 %*% mtx2;
}
}
if (useFastCor)
{
# Update the transformed data set
if (verbose) cat("|")
if (( length(colused) == 1) && (length(alphaused) == 1 ))
{
refdata[,colused] <- refdata[,colused] + as.numeric(refdata[,alphaused])*as.numeric(betamatrix[alphaused,colused])
}
else
{
if (length(alphaused) > 1)
{
refdata[,colused] <- refdata[,colused] + Rfast::mat.mult(as.matrix(refdata[,alphaused]),as.matrix(betamatrix[alphaused,colused]))
}
else
{
mtx1 <- as.matrix(refdata[,alphaused])
mtx2 <- t(as.matrix(betamatrix[alphaused,colused]))
refdata[,colused] <- refdata[,colused] + mtx1 %*% mtx2;
}
}
}
}
if (verbose) cat("><")
if (length(colused) > 0)
{
colsd <- apply(as.matrix(refdata[,colused]),2,sd,na.rm = TRUE);
if (sum(colsd==0) > 0)
{
# Zero-variance residuals are not allowed, so we add a tiny amount of noise
zerovar <- colused[colsd==0];
for (zcheck in zerovar)
{
refdata[,zcheck] <- refdata[,zcheck] + rnorm(nrow(refdata),0,1.0e-10);
}
}
}
# Recompute correlation matrix
topFeatures <- unique(c(topFeatures,intopfeat));
if ((useFastCor) && (length(colused)>1000))
{
# Fast correlation for large matrices and linear fitting
cormat <- abs(Rfast::cora(as.matrix(refdata[,varincluded]),large=TRUE))
colnames(cormat) <- varincluded
rownames(cormat) <- varincluded
}
else
{
if (length(colused) > 0)
{
cormatS <- abs(cor(refdata[,varincluded],refdata[,colused],method=method))
cormat[,colused] <- cormatS
cormat[colused,] <- t(cormatS)
cormatS <- NULL
}
}
if (verbose) cat(">")
cormat[is.na(cormat)] <- 0;
diag(cormat) <- 0;
if (verbose)
{
cat ("Tot Used:",length(totused),", Added:",addedlist,", Zero Std:",sum(colsd==0),", Max Cor:",sprintf("%5.3f",max(cormat)));
}
if ( (length(intopfeat) == length(lastintopfeat)) && (length(decorrelatedFetureList) == length(lastdecorrelated)) )
{
if ((sum(intopfeat %in% lastintopfeat) == length(intopfeat)) && (sum(decorrelatedFetureList %in% lastdecorrelated) == length(decorrelatedFetureList)))
{
addedlist <- 0;
}
}
lastdecorrelated <- decorrelatedFetureList;
lastintopfeat <- intopfeat;
}
mxScor <- max(cormat);
nextthr <- sum(thrvalues > mxScor) + 1
#
if ((nextthr == nextthra) && (addedlist == 0) && (mxScor > thr))
{
# loop faster if there were no changes
lp <- lp + 2
}
if ((addedlist == 0) && (mxScor > thr))
{
# Advance so the next threshold value is checked
skv <- skv + 1
}
nextthra <- sum(thrvalues > mxScor) + skv
}
# The transformation has been completed. Next we do the final estimations
if (lp >= maxLoops)
{
warning("Maximum number of iterations reached: Failed to achieve target correlation\n")
}
# Do the final bookkeeping: Get the final transformation matrix and transform the input data
correlationMeasureEvolution <- c(correlationMeasureEvolution,max(cormat));
sparcityEvolution <- c(sparcityEvolution,sum(DeCorrmatrix == 0));
betamatrix <- NULL;
tmparincluded <- varincluded;
# Only variables involved in transformation
varincluded <- tmparincluded[tmparincluded %in% totused];
bfeat <- bfeat[bfeat %in% varincluded];
if (useDeCorr)
{
# If the transformation was estimated, apply the final transformation to the observed data
DeCorrmatrix <- DeCorrmatrix[varincluded,varincluded] # Remove variables that were not used in the transformation
abfeat <- varincluded[apply(DeCorrmatrix!=0,2,sum)==1]
bfeat <- unique(c(bfeat,abfeat));
dataTransformed <- data
if (length(varincluded) > 1)
{
# Transform the observed data
dataTransformed[,varincluded] <- Rfast::mat.mult(as.matrix(data[,varincluded]),DeCorrmatrix);
# Observing internal behavior
correlatedToBase <- varincluded[varincluded %in% bfeat];
if (length(correlatedToBase) > 1)
{
correlatedToBase <- apply(DeCorrmatrix[correlatedToBase,] != 0,2,sum)
correlatedToBase <- names(correlatedToBase[correlatedToBase > 0])
}
else
{
if (length(correlatedToBase) == 1)
{
correlatedToBase <- varincluded[DeCorrmatrix[correlatedToBase,] != 0]
}
}
correlatedToBase <- correlatedToBase[!(correlatedToBase %in% bfeat)];
if (verbose)
{
# Recompute correlation matrix only in verbose mode
if ((useFastCor) && (length(varincluded)>1000))
{
cormat <- abs(Rfast::cora(as.matrix(dataTransformed[,varincluded]),large=TRUE))
colnames(cormat) <- varincluded
rownames(cormat) <- varincluded
}
else
{
cormat <- abs(cor(dataTransformed[,varincluded],method=method))
}
diag(cormat) <- 0;
cormat[is.na(cormat)] <- 0;
cat ("\n\r [",lp,"],",max(cormat),"Decor Dimension:",length(varincluded),"Nused:",length(totused),". Cor to Base:",length(correlatedToBase),", ABase:",length(AdrivingFeatures),", Outcome Base:",length(drivingFeatures),"\n\r")
}
# One last step: Update the column names to reflect that they are latent variables
changed <- (apply(DeCorrmatrix!=0,2,sum) > 1);
lavariables <- colnames(DeCorrmatrix)[changed]
newnames <- colnames(dataTransformed)
newnames[newnames %in% lavariables] <- paste("La_",newnames[newnames %in% lavariables],sep="")
colnames(dataTransformed) <- newnames
newnames <- colnames(DeCorrmatrix)
newnames[newnames %in% lavariables] <- paste("La_",newnames[newnames %in% lavariables],sep="")
colnames(DeCorrmatrix) <- newnames
newnames <- names(fscore)
newnames[newnames %in% lavariables] <- paste("La_",newnames[newnames %in% lavariables],sep="")
names(fscore) <- newnames
}
}
else
{
dataTransformed <- dataTransformed[dataids,];
}
}
# Store evolution data
IDeAEvolution <-list(Corr=correlationMeasureEvolution,Spar=sparcityEvolution/(decordim^2));
# VarRatio stores, for each feature, the ratio of its transformed (residual) variance to its observed variance
VarRatio <- apply(dataTransformed[,colnames(DeCorrmatrix)],2,var);
VarRatio <- VarRatio/apply(data[,rownames(DeCorrmatrix)],2,var);
VarRatio <- VarRatio[order(-VarRatio)]
noaltered <- colnames(data)[!(colnames(data) %in% rownames(DeCorrmatrix))]
if (length(noaltered)>0)
{
varone <- rep(1,length(noaltered))
names(varone) <- noaltered
VarRatio <- c(varone,VarRatio)
}
# Return attributes
attr(dataTransformed,"UPLTM") <- DeCorrmatrix;
attr(dataTransformed,"fscore") <- fscore;
attr(dataTransformed,"TotalAdjustments") <- countf;
attr(dataTransformed,"drivingFeatures") <- AdrivingFeatures;
attr(dataTransformed,"unaltered") <- bfeat;
attr(dataTransformed,"LatentVariables") <- lavariables;
attr(dataTransformed,"useDeCorr") <- useDeCorr;
attr(dataTransformed,"unipvalue") <- adjunipvalue;
attr(dataTransformed,"R.critical") <- rcrit;
attr(dataTransformed,"IDeAEvolution") <- IDeAEvolution;
attr(dataTransformed,"VarRatio") <- VarRatio;
return(dataTransformed)
}
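## --- Hypothetical usage sketch (illustration only, not part of the package source) ---
## Direct call of IDeA on synthetic data, showing that the transformed columns are the
## observed columns multiplied by the returned UPLTM matrix. The data and object names
## below are made up; the block is wrapped in `if (FALSE)` so sourcing this file does not run it.
if (FALSE)
{
  set.seed(7)
  n <- 300
  x1 <- rnorm(n); x2 <- 0.85*x1 + rnorm(n, sd = 0.4); x3 <- rnorm(n)
  obsData <- data.frame(x1, x2, x3)
  decorrelated <- IDeA(obsData, thr = 0.8, method = "fast", verbose = TRUE)
  UPLTM <- attr(decorrelated, "UPLTM")
  ## Reconstruction check: should be close to zero for the transformed columns
  max(abs(as.matrix(obsData[, rownames(UPLTM)]) %*% UPLTM -
          as.matrix(decorrelated[, colnames(UPLTM)])))
  ## Evolution of the maximum correlation and matrix sparsity across the loops
  attr(decorrelated, "IDeAEvolution")
}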
# Apply a previously estimated decorrelation transformation to new data
predictDecorrelate <- function(decorrelatedobject,testData)
{
if (attr(decorrelatedobject,"useDeCorr") && !is.null(attr(decorrelatedobject,"UPLTM")))
{
decorMat <- attr(decorrelatedobject,"UPLTM")
testData[,rownames(decorMat)] <- Rfast::mat.mult(as.matrix(testData[,rownames(decorMat)]),decorMat);
newnames <- colnames(testData)
lavariables <- attr(decorrelatedobject,"LatentVariables");
newnames[newnames %in% lavariables] <- paste("La_",newnames[newnames %in% lavariables],sep="")
colnames(testData) <- newnames
}
else
{
warning("The object does not have a decorrelation Matrix\n")
}
return (testData)
}
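## --- Hypothetical usage sketch (illustration only, not part of the package source) ---
## Learn the decorrelation on a training split and apply the same transformation to
## held-out data with predictDecorrelate. All object names are made up; the block is
## wrapped in `if (FALSE)` so sourcing this file does not execute it.
if (FALSE)
{
  set.seed(1)
  n <- 300
  x1 <- rnorm(n); x2 <- 0.85*x1 + rnorm(n, sd = 0.4); x3 <- rnorm(n)
  fullData <- data.frame(x1, x2, x3)
  trainIdx <- sample(n, 0.7*n)
  trainDecor <- ILAA(fullData[trainIdx, ], thr = 0.8)                  # learn the UPLTM on training data
  testDecor <- predictDecorrelate(trainDecor, fullData[-trainIdx, ])   # apply the same UPLTM to new data
  head(testDecor)
}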
# Get the latent variable formulas as characters
getlatentCharFormula <- function(latentCoeff)
{
deFromula <- character(length(latentCoeff))
names(deFromula) <- names(latentCoeff)
for (dx in names(deFromula))
{
coef <- latentCoeff[[dx]]
cname <- names(latentCoeff[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf] != 0)
{
if (abs(log10(abs(coef[cf])))<=2)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ (%5.3f)%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("- (%.3f)%s",abs(coef[cf]),cname[cf]))
}
}
else
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ (%.2e)%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("- (%.2e)%s",abs(coef[cf]),cname[cf]))
}
}
}
deFromula[dx] <- str_replace_all(deFromula[dx],"\\(1.000\\)","")
}
}
}
return(deFromula)
}
# Get the transformation coefficients for each latent variable
getLatentCoefficients <- function(decorrelatedobject)
{
CoeffList <- list();
nonZeronames <- character();
UPLTM <- attr(decorrelatedobject,"UPLTM")
namecol <- colnames(UPLTM);
if (length(namecol) > 0)
{
n=1;
for (i in 1:ncol(UPLTM))
{
associ <- abs(UPLTM[,i])>0;
if (sum(associ) > 1)
{
nonZeronames <- append(nonZeronames,namecol[i]);
CoeffList[[n]] <- UPLTM[associ,i];
n=n+1;
}
}
names(CoeffList) <- nonZeronames;
}
attr(CoeffList,"LatentCharFormulas") <- getlatentCharFormula(CoeffList)
return (CoeffList)
}
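## --- Hypothetical usage sketch (illustration only, not part of the package source) ---
## Retrieve the per-latent-variable coefficients and their printable formulas from a
## decorrelated data frame. Names are made up; wrapped in `if (FALSE)` so it does not run.
if (FALSE)
{
  set.seed(3)
  x1 <- rnorm(150); x2 <- 0.9*x1 + rnorm(150, sd = 0.3); x3 <- rnorm(150)
  decorDF <- ILAA(data.frame(x1, x2, x3), thr = 0.8)
  latentCoeffs <- getLatentCoefficients(decorDF)
  latentCoeffs                              # named list: coefficients defining each latent variable
  attr(latentCoeffs, "LatentCharFormulas")  # the same information as readable character formulas
}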
## Get the coefficients associated with each of the observed features
## The latentModel input may be either a named vector of linear coefficients or a model-like object providing $formula and $coef
getObservedCoef <- function(decorrelatedobject,latentModel)
{
latentCoef <- latentModel
Outcome <- NULL;
if (!is.null(latentModel$formula))
{
Outcome <- as.character(latentModel$formula)[2]
latentCoef <- latentModel$coef
}
UPLTM <- attr(decorrelatedobject,"UPLTM")
laBetas <- numeric(ncol(UPLTM))
names(laBetas) <- colnames(UPLTM)
namesin <- names(latentCoef) %in% colnames(UPLTM)
lanamesinUPLTM <- names(latentCoef)[namesin]
lanamesNotinUPLTM <- names(latentCoef)[!namesin]
laBetas[lanamesinUPLTM] <- latentCoef[lanamesinUPLTM]
obsCoef <- c(latentCoef[lanamesNotinUPLTM],t(UPLTM %*% laBetas))
names(obsCoef) <- c(lanamesNotinUPLTM,rownames(UPLTM))
obsCoef <- obsCoef[obsCoef!=0]
theformula <- str_flatten(names(obsCoef)," + ")
theformula <- str_replace_all(theformula,"\\(Intercept\\)","1")
theformula <- paste(Outcome,"~",theformula)
obsCoef <- list(formula=theformula,
coefficients=obsCoef,
LatentCharFormulas=attr(getLatentCoefficients(decorrelatedobject),"LatentCharFormulas")
)
class(obsCoef) <- c("ObservedModel")
return (obsCoef)
}
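## --- Hypothetical usage sketch (illustration only, not part of the package source) ---
## Map a linear model fitted in the decorrelated (latent) space back to the observed
## features. getObservedCoef accepts either a named coefficient vector or a model-like
## list with $formula and $coef, so here a plain lm fit is wrapped into such a list.
## Names are made up; wrapped in `if (FALSE)` so sourcing this file does not execute it.
if (FALSE)
{
  set.seed(11)
  n <- 200
  x1 <- rnorm(n); x2 <- 0.9*x1 + rnorm(n, sd = 0.3); x3 <- rnorm(n)
  y <- 2*x1 - x3 + rnorm(n, sd = 0.5)
  decorTrain <- ILAA(data.frame(y, x1, x2, x3), thr = 0.8, Outcome = "y")
  fit <- lm(y ~ ., data = decorTrain)                            # model in the latent space
  latentModel <- list(formula = formula(fit), coef = coef(fit))  # wrap it for getObservedCoef
  observed <- getObservedCoef(decorTrain, latentModel)
  observed$formula        # model formula rewritten in terms of the observed features
  observed$coefficients   # equivalent coefficients on the observed features
}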