#library(selbal)
# Define the function selbal
selbal <- function(x, y, th.imp = 0, covar = NULL, logit.acc="AUC",
logt=T, col = c("steelblue1", "tomato1"), tab=T,
draw=T, maxV = 1e10, zero.rep = "bayes"){
# y1=as.numeric(y)
# nulldev0<-deviance(glm(y1~1))
#
#----------------------------------------------------------------------------#
# STEP 0: load libraries and extract information
#----------------------------------------------------------------------------#
#----------------------------------------------#
# 0.1: information about the response variable
#----------------------------------------------#
# Class of the response variable
classy <- class(y)
# Family for the glm (default: gaussian)
f.class <- "gaussian"
# numy to be y and it will be modified if y is a factor
numy <- y
# If y is a factor, compute the number of levels of y
if (classy == "factor"){
ylev <- levels(y)
numy <- as.numeric(y) - 1
f.class <- "binomial"
# nulldev0<-deviance(glm(numy~1, family=f.class))
}
#------------------------------------------------------------------#
# 0.2: information and transformation of the independent variables
#------------------------------------------------------------------#
# Load library
suppressMessages(library(zCompositions))
# Build a table with the response variable and covariates for correction
if (!is.null(covar)){ dat <- data.frame(cbind(numy, covar))
} else { dat <-data.frame(numy)}
# The logCounts (with zero replacement)
if (logt == F){ logCounts <- x
} else{
logCounts <- log(cmultRepl2(x, zero.rep = zero.rep))
}
# Variables name
var.nam <- rem.nam <- colnames(x)
#--------------------------------------------------------------------------#
# 0.3: auxiliar functions
#--------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# Auxiliar function to compute the first balance
#--------------------------------------------------------------------------#
first.bal<- function(logCounts, Y, covar=NULL){
#------------------------------------------------------------------------#
# STEP 0: extract information
#------------------------------------------------------------------------#
# Number and name of variables
n <- ncol(logCounts)
nam <- colnames(logCounts)
#------------------------------------------------------------------------#
# STEP 1: build the output matrix
#------------------------------------------------------------------------#
# The matrix
if (classy=="factor"){ M<-matrix(0, nrow=n, ncol=n)
}else{ M<-matrix(1e99, nrow=n, ncol=n)}
# Row.names and colnames
row.names(M)<-colnames(M)<-nam
#------------------------------------------------------------------------#
# STEP 2: complete the matrix
#------------------------------------------------------------------------#
if(classy=="factor"){
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
for (i in 2:n){
for (j in 1:(i-1)){
# Build a table with the information
TAB <- data.frame(cbind(Y,logCounts[,i]-logCounts[,j],covar))
# Fit the regression model
FIT <- glm(Y ~ .,data=TAB, family = f.class)
# Add the value into the matrix
ifelse(FIT$coefficients[2]>0,
M[i,j] <- logit.cor(FIT, y = y, covar = covar, logit.acc), #CANVI Y
M[j,i] <- logit.cor(FIT, y = y, covar = covar, logit.acc)) #CANVI Y
} # End j
} # End i
# Indices for the highest logit.cor value
r <- which(M == max(M), arr.ind = T)
} else {
for (i in 2:n){
for (j in 1:(i-1)){
# Build a table with the information
TAB <- data.frame(cbind(Y,logCounts[,i]-logCounts[,j],covar))
# Fit the regression model
FIT <- glm(Y ~ .,data=TAB, family = f.class)
# Complete the matrix
ifelse(FIT$coefficients[2]>0,
M[i,j] <- mean(FIT$residuals^2),
M[j,i] <- mean(FIT$residuals^2))
} # End j
} # End i
# Indices for the lowest MSE value
r <- which(M == min(M), arr.ind = T)
}
#browser()
# Return the row and column of the maximum value
return(r)
}
#--------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# Auxiliar function to compute the "association value" when adding a new
# variable into the balance*
#--------------------------------------------------------------------------#
balance.adding.cor <- function(x, LogCounts, POS, NEG, numy, covar=NULL){
#----------------------------------------#
# If x added into the numerator, . .
#----------------------------------------#
# The "numerator"
S1.pos <- rowM(LogCounts[,c(POS,x)]); s1 <- length(POS) + 1
# The "denominator"
S2.pos <- rowM(LogCounts[,NEG]) ; s2 <- length(NEG)
# The balance
BAL <- sqrt((s1*s2)/(s1+s2))*(S1.pos - S2.pos)
# Data.frame with the variables
D.pos <- data.frame(cbind(numy, BAL, covar))
# Regression model
FIT.pos <- glm(numy~., data=D.pos, family=f.class)
# The MSE or the corresponding value for dichotomous responses
if(classy=="numeric"){ C.pos <- mean(FIT.pos$residuals^2)
# }else{ C.pos <- logit.cor(FIT.pos,numy,covar = covar, logit.acc)} #diferencies
}else{ C.pos <- logit.cor(FIT.pos,y,covar = covar, logit.acc)} #diferencies
#----------------------------------------#
# If x added into the numerator, . .
#----------------------------------------#
# The numerator
S1.neg <- rowM(LogCounts[,POS]) ; s1 <- length(POS)
# The denominator
S2.neg <- rowM(LogCounts[,c(NEG,x)]) ; s2 <- length(NEG) + 1
# The balance
BAL <- sqrt((s1*s2)/(s1+s2))*(S1.neg - S2.neg)
# Data.frame with the variables
D.neg <- data.frame(cbind(numy, BAL, covar))
# Regression model
FIT.neg <- glm(numy~., data=D.neg, family=f.class)
# The MSE or the corresponding value for dichotomous responses
if(classy=="numeric"){ C.neg <- mean(FIT.neg$residuals^2)
#}else{ C.neg <- logit.cor(FIT.neg,numy,covar = covar, logit.acc)} #diferencies
}else{ C.neg <- logit.cor(FIT.neg,y,covar = covar, logit.acc)} #diferencies
# Correlation values
COR <- c(C.pos, C.neg)
# Return the values
return(COR)
}
#------------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# STEP 1: depending on the response variable class, . . .
#--------------------------------------------------------------------------#
# Define the first balance
A1 <- first.bal(logCounts, Y = numy, covar=covar)
# Variables taking parti into the first balance
POS <- colnames(logCounts)[A1[1,1]]
NEG <- colnames(logCounts)[A1[1,2]]
# Included variables in the model
INC.VAR <- c(POS, NEG)
# Delete these variables from rem.nam
rem.nam <- setdiff(rem.nam, INC.VAR)
# Define the initial balance (B_1)
S1 <- logCounts[,POS]
S2 <- logCounts[,NEG]
# Assign the values to B
B <- sqrt(1/2)*(S1 - S2)
#--------------------------------------------------------------------------#
# Information about the ACC for the Balance values
#--------------------------------------------------------------------------#
# Build a new data.frame
dat.ini <- cbind(dat, B)
# Fit the regression model
FIT.initial <- glm(numy ~ .,data=dat.ini, family = f.class)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Define the initial "accuracy" or "association" value
if(classy=="numeric"){ ACC.Bal <- mean(FIT.initial$residuals^2)
}else{ ACC.Bal <- logit.cor(FIT.initial, y, covar = covar, logit.acc)} #diferencies numy
# }else{ ACC.Bal <- logit.cor(FIT.initial, numy, covar = covar, logit.acc)} #new diferencies numy
#browser()
#----------------------------------------------------------------------------#
# ACC reference
ACC.ref <- ACC.Bal
#------------------------------------------------------------------------#
# Improve the balances
#------------------------------------------------------------------------#
# Define some parameters
# The p.value to compare 2 balances (one of them with an additional
# variable)
ACC.set <- ACC.ref
# Generate a data.frame with the relevant information
if(tab){
# Build a data.frame with the information
EVOL <- data.frame(POS,NEG,ACC.ref, ACC.ref)
colnames(EVOL) <- c("NUMERATOR","DENOMINATOR", "ACC", "Increase")
# Index of factor (character) columns
f.indx <- sapply(EVOL, is.factor)
}
# Index of the number of variables for the balance
nV <- 2
#------------------------------#
# For numeric responses, . . .
#------------------------------#
if (classy=="numeric"){
# While there is an improvement and the maximun number of variables
# has not been reached, . . .
while (ACC.set <= ACC.ref && length(rem.nam)!=0 && nV<maxV){
# The new p.bal.ref is the p.set of the previous step
ACC.ref <- ACC.set
# Function to extract the p-value
add2bal.ACC <- matrix(0, nrow = 0, ncol = 2)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Extract the p-values
add2bal.ACC <- t(sapply(rem.nam, function(x)
balance.adding.cor(x, LogCounts = logCounts, POS, NEG, numy = numy,
covar = covar)))
# Add names to the rows
row.names(add2bal.ACC) <- rem.nam
# Extract which is the variable (only the first row)
ACC.opt <- which(add2bal.ACC==min(add2bal.ACC),arr.ind = T)[1,]
# Modify p.set
ACC.set <- min(add2bal.ACC)
# If there is an improvement, . . .
#if (abs(ACC.set - ACC.ref) > th.imp){
if ((ACC.set - ACC.ref) < th.imp){
INC.VAR <- c(INC.VAR, rem.nam[ACC.opt[1]])
ACC.Bal <- c(ACC.Bal, ACC.set)
nV <- nV + 1
if (ACC.opt[2]==1){
POS <- c(POS, rem.nam[ACC.opt[1]])
if(tab){
EVOL[f.indx] <- lapply(EVOL[f.indx], as.character)
EVOL <- rbind(EVOL, c(rem.nam[ACC.opt[1]], "-", ACC.set,
ACC.set - ACC.ref))}
} else if (ACC.opt[2]==2){
NEG <- c(NEG, rem.nam[ACC.opt[1]])
if(tab){
EVOL[f.indx] <- lapply(EVOL[f.indx], as.character)
EVOL <- rbind(EVOL, c("-", rem.nam[ACC.opt[1]], ACC.set,
ACC.set - ACC.ref))}
}
} else {ACC.set <- 0 }
# Remainng variables (possible to add to the balance)
rem.nam <- rem.nam[-ACC.opt[1]]
} # End while
}else{
#-----------------------------------#
# For non-numeric responses, . . .
#-----------------------------------#
# While there is an improvement and the maximun number of variables has not
# been reached, . . .
while (ACC.set >= ACC.ref && length(rem.nam)!=0 && nV<maxV){
# The new p.bal.ref is the p.set of the previous step
ACC.ref <- ACC.set
# Function to extract the p-value
add2bal.ACC <- matrix(0, nrow = 0, ncol = 2)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Extract the p-values
add2bal.ACC <- t(sapply(rem.nam, function(x)
balance.adding.cor(x, LogCounts = logCounts, POS, NEG, numy = numy,
covar = covar)))
# Add names to the rows
row.names(add2bal.ACC) <- rem.nam
# Extract which is the variable (only the first row)
ACC.opt <- which(add2bal.ACC==max(add2bal.ACC),arr.ind = T)[1,]
# Modify p.set
ACC.set <- max(add2bal.ACC)
# If there is an improvement, . . .
if ((ACC.set - ACC.ref) > th.imp){
# Add the included variable
INC.VAR <- c(INC.VAR, rem.nam[ACC.opt[1]])
# Add the Accuracy value
ACC.Bal <- c(ACC.Bal, ACC.set)
# A new variable into the balance
nV <- nV + 1
# Variable included into NUMERATOR or DENOMINATOR?
if (ACC.opt[2]==1){
POS <- c(POS, rem.nam[ACC.opt[1]])
if(tab){
EVOL[f.indx] <- lapply(EVOL[f.indx], as.character)
EVOL <- rbind(EVOL, c(rem.nam[ACC.opt[1]], "-", ACC.set,
ACC.set - ACC.ref))}
} else if (ACC.opt[2]==2){
NEG <- c(NEG, rem.nam[ACC.opt[1]])
if(tab){
EVOL[f.indx] <- lapply(EVOL[f.indx], as.character)
EVOL <- rbind(EVOL, c("-", rem.nam[ACC.opt[1]], ACC.set,
ACC.set - ACC.ref))}
}
} else {ACC.set <- 0 }
# Remainng variables (possible to add to the balance)
rem.nam <- rem.nam[-ACC.opt[1]]
}
}
# K1 and k2
k1 <- length(POS); k2 <- length(NEG)
# The final Balance
FINAL.BAL <- sqrt((k1*k2)/(k1+k2))*
(rowM(logCounts[,POS])- rowM(logCounts[,NEG]))
#----------------------------------------------------------------------------#
# FINAL GLM
#----------------------------------------------------------------------------#
# Auxiliar data.frame for graphical representation
U <- data.frame(dat, FINAL.BAL)
colnames(U)[ncol(U)] <- "V1"
# Regression model
FIT.final <- glm(numy~., data=U, family = f.class)
# Draw the plot if draw == T
if (draw){
#----------------------------------------------------------------------------#
# GRAPHICAL REPRESENTATION
#----------------------------------------------------------------------------#
# Load library
library(ggplot2)
#-----------------------------------------#
# FIRST: The names of included variables
#-----------------------------------------#
# Variables included
T1 <- c("NUMERATOR", POS)
T2 <- c("DENOMINATOR",NEG)
# Parameter to specify the limits for writting
yl <- max(length(T1), length(T2)) + .5
# Empty plot with text
df <- data.frame()
Imp.table <- ggplot(df) + xlim(0, 100) + ylim(-0.5, yl) + theme_void() +
annotate("text",
x = 75,
y = floor(yl):ceiling(yl-length(T1)),
label = T1,
colour = c("royalblue1",rep("black",length(T1)-1)),
fontface = 2) +
annotate("text",
x = 25,
y = floor(yl):ceiling(yl-length(T2)),
label = T2,
colour = c("royalblue1",rep("black",length(T2)-1)),
fontface = 2)
# Parameter 2 to specify the limits for writting
T1 <- c(POS,"A")
T2 <- c(NEG,"B")
yl2 <- max(length(T1), length(T2));
yl2 <- yl2*1.05;
escal <- 3;
bot <- 5*escal*yl2/100;
# Empty plot 2 with text
ndiv = max(3,floor(yl2))+1;
lineh <- 0 # 0.5*ceiling(yl-length(T1));
df2 <- data.frame()
colbalance<-colbalance<-"brown3"
Imp.table2 <- ggplot(df2) + xlim(0, 100) + ylim(-bot, ndiv) + theme_void() +
geom_segment(aes(x = 10, y = lineh, xend = 90, yend = lineh),color=colbalance, size=1.3) +
geom_segment(aes(x = 50-escal, y = lineh-bot, xend = 50+escal, yend = lineh-bot),color=colbalance, size=1) +
geom_segment(aes(x = 50+escal, y = lineh-bot, xend = 50+escal, yend = lineh),color=colbalance, size=1) +
geom_segment(aes(x = 50-escal, y = lineh-bot, xend = 50-escal, yend = lineh),color=colbalance, size=1) +
annotate("text",
x = 75,
y = seq(bot, floor(yl2), length.out = ndiv),
label = c(T1,rep("",ndiv-length(T1))),
colour = c(rep("black",length(T1)-1),rep(colbalance,ndiv-length(T1)+1)),
fontface = 2) +
annotate("text",
x = 25,
y = seq(bot, floor(yl2), length.out = ndiv),
label = c(T2,rep("",ndiv-length(T2))),
colour = c(rep("black",length(T2)-1),rep(colbalance,ndiv-length(T2)+1)),
fontface = 2)
#-----------------------------------------#
# SECOND: The representation of the plots
#-----------------------------------------#
# The plot depending of the class of the response variable
if (classy=="factor"){
#---------------------------------------------------#
# The composition of the final plot
#---------------------------------------------------#
# BOXPLOT 1
BoxP <- ggplot(U, aes(x=y, y=V1, fill=y)) +
geom_boxplot(color="black", size=1) +
scale_fill_manual(values=col) +
theme_bw() +
ylab("Balance") +
xlab("Factor") +
theme(legend.position = "none")
# BOXPLOT 2
BoxP2 <- ggplot(U, aes(x=y, y=V1, fill=y)) +
geom_boxplot(color="black", size=1) +
scale_fill_manual(values=col) +
theme_bw() +
ylab("Balance") +
xlab("") +
theme(legend.position = "none")+coord_flip()
# Density plot 1 for the balance
ydensity <- ggplot(U, aes(V1, fill=y)) +
geom_density(alpha=.5, size=1.25) +
scale_fill_manual(values = col) +
theme_bw() + xlab("") + ylab("") +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
coord_flip()
# Density plot 2 for the balance
ydensity2 <- ggplot(U, aes(V1, fill=y)) +
geom_density(alpha=.5, size=1) +
scale_fill_manual(values = col) +
theme_bw() + xlab("") + ylab("") +
theme(legend.position = "none")
# ROC - curve
suppressMessages(library(pROC))
# Build ROC curve
A<-roc(response = U$numy,predictor = FIT.final$fitted.values, quiet = TRUE)
# Extract the sensitivity and specificiti values
ROC.TAB <- data.frame(x=1-A$specificities, y=A$sensitivities)
# Order them for a correct representation
ROC.TAB <- ROC.TAB[order(ROC.TAB$y),]
# AUC value
#auc.val <- round(logit.cor(FIT.final,y = U$numy, covar = covar, logit.acc = logit.acc),3)
#auc.val<-round(as.numeric(auc(y, FIT.final$fitted.values, quiet=TRUE)),3) # diferencies
auc.val<-round(as.numeric(auc(U$numy, FIT.final$fitted.values, quiet=TRUE)),3) # new diferencies
# The plot
ROC.plot <- ggplot(data=ROC.TAB, aes(x=x, y=y)) +
geom_line() +
ggtitle("ROC curve") +
xlab("FPR") + ylab("TPR") +
geom_step() +
annotate("text", x = .75, y = .2,
label = paste("AUC-ROC \n",auc.val), size = 2.5) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
# Load libraries
library("gridExtra")
library("grid")
FINAL.P <- arrangeGrob(Imp.table, ROC.plot, BoxP, ydensity,
ncol=2, nrow=2, widths=c(5,1.25), heights=c(2, 5),
vp=viewport(width=0.8, height=0.8))
library(gtable)
g1 <- ggplotGrob(Imp.table2)
g2 <- ggplotGrob(BoxP2)
g3 <- ggplotGrob(ydensity2)
g <- rbind(g1, g2, size = "first")
g <- rbind(g, g3, size = "first")
g$widths <- unit.pmax(g1$widths,g2$widths)
FINAL.P2 <- g
} else {
# Fit the regression model
FIT.p <- glm(y ~ FINAL.BAL, family = f.class)
PLOT.G <- ggplot(U, aes(V1, y)) +
geom_point(colour = "black", size = 3) +
geom_abline(intercept=FIT.p$coefficients[1],
slope=FIT.p$coefficients[2], lwd=3, col="blue") +
theme_bw() +
xlab("Balance value") + ylab("Response variable") +
ggtitle("") +
theme(strip.text.x = element_text(size=12, angle=0,
face="bold",colour="white"),
strip.text.y = element_text(size=12, face="bold"),
strip.background = element_rect(colour="black",
fill="black"),
plot.title = element_text(size=20, vjust=2.25, hjust= .5,
face = "bold"),
legend.title = element_text(face="bold"),
legend.text = element_text(face="bold"))
# Load libraries
library(grid)
library(gridExtra)
FINAL.P <- arrangeGrob(Imp.table, PLOT.G, nrow=2,
heights=c(0.2,0.5),vp=viewport(width=0.8,
height=0.8))
FINAL.P2 <- arrangeGrob(Imp.table2, PLOT.G, nrow=2,
heights=c(1,2),vp=viewport(width=0.8,
height=0.8))
}
# Draw the plot if draw == T
grid.draw(FINAL.P)
} #end plot
if (classy=="numeric") ROC.plot=NULL
# Round the values
if(tab){
EVOL[,3]<-round(as.numeric(EVOL[,3]),5)
EVOL[,4]<-round(as.numeric(EVOL[,4]),5)
if (draw== TRUE){
L <- list(balance.values=FINAL.BAL, numerator=POS, denominator=NEG, balance=INC.VAR, accuracy=ACC.Bal, balance.selection=EVOL, global.plot=FINAL.P,
glm=FIT.final, global.plot2 = FINAL.P2, ROC.plot = ROC.plot)
} else {
L <- list(balance.values=FINAL.BAL, numerator=POS, denominator=NEG, balance=INC.VAR, accuracy=ACC.Bal, balance.selection=EVOL, glm=FIT.final)
}
} else {
if (draw== TRUE){
L <- list(balance.values=FINAL.BAL, numerator=POS, denominator=NEG, balance=INC.VAR, accuracy=ACC.Bal, global.plot=FINAL.P,
glm=FIT.final,global.plot2 = FINAL.P, ROC.plot = ROC.plot)
} else {
L <- list(balance.values=FINAL.BAL, numerator=POS, denominator=NEG, balance=INC.VAR, accuracy=ACC.Bal, glm=FIT.final)
}
}
return(L)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# FUNCTION: selbal.cv
# INPUT:
# x: matrix with counts
# y: response variable
# n.fold: number of folds for each iteration
# n.iter: number of iterations
# seed: seed for results replication (used for build the groups)
# th.imp: threshold of improvement when adding a variable. If the
# improvement is not higher than this quantity, it stops.
# covar: data.frame with covariates
# logit.acc: the accuracy value to return when a logit regression is made
# user_numVar: number of variables the user decides to include
# OUTPUT:
# $NUM: variables included in the numerator (whole data set)
# $DEN: variables included in the denominator (whole data set)
# $ROB.TAB: table of "robustness" with the percentage of times that the
# variable appears in the cross - validation model
# $GLOBAL.ACC: ACC for the whole data set after computing the balance
#------------------------------------------------------------------------------#
#' Cross - validation process for the selection of the optimal number of
#' variables and robustness evaluation
#'
#'
#' @param x a \code{matrix} object with the information of variables
#' (\emph{columns}) for each sample (\emph{rows}).
#' @param y the response variable, either continuous or dichotomous.
#' @param n.fold number of folds in which to divide the whole data set.
#' @param n.iter number of iterations for the cross - validation process.
#' @param seed a seed to make the results reproducible.
#' @param th.imp the minimum increment needed when adding a new variable into
#' the balance in order to consider an improvement.
#' @param covar \code{data.frame} with the variables to adjust for
#' (\emph{columns}).
#' @param col \code{vector} of two colours for differentiate the variables
#' appearing in the numerator and in the denominator of the balances.
#' @param col2 \code{vector} of three colours for the lines of the barplot with
#' the aim of identifying if each variable appears in the Global balance, in the
#' CV - balance or in both of them.
#' @param logit.acc when \code{y} is dichotomous, the measure to compute for
#' the correlation between \code{y} and the proposed \emph{balance}
#' adjusting for covariates. One of the following values: \code{"AUC"} (default),
#' \code{"Dev"}, \code{"Rsq"} or \code{"Tjur"}.
#' @param maxV \code{numeric} value defining the maximum number of variables
#' composing the balance. Default 1e10 to give prevalence to \code{th.imp}
#' parameter.
#' @param zero.rep a value defining the method to use for zero - replacement.
#' \code{"bayes"} for BM-replacement or \code{"one"} to add one read tho each
#' cell of the matrix.
#' @param opt.cri parameter indicating the method to determine the optimal
#' number of variables. \code{"max"} to define this number as the number of
#' variables which maximizes the association value or \code{"1se"} to take also
#' the standard error into account.
#' @param user_numVar parameter to modify the choosen optimal number of variables.
#' If it is used, it is the final number of variables used in the method.
#'
#'
#' @return A \code{list} with the following objects:
#'
#' \itemize{
#' \item a boxplot with the mean squared errors (numeric responses) or AUC
#' values (dichotomous responses) for the test data sets using the balances
#' resulted in the cross - validation. Branches represent the standard error and
#' the optimal number of components according with the \code{opt.cri} criteria
#' is highlighted with a dashed line.
#' \item barplot with the proportion of times a variable appears in the
#' cross - validation balances.
#' \item a graphical representation of the Global Balance (draw it using
#' \code{grid.draw} function).
#' \item a table with the infromation of Global Balance, CV Balance and the
#' three most repeated balances in the cross - validation process (draw it using
#' \code{plot.tab} function).
#' \item a vector with the accuracy values (MSE for continuous variables and
#' AUC for dichotomous variables) obtained in the cross - validation procedure.
#' \item a table with the variables appearing in the Global Balance in a useful
#' format for \code{bal.value} function in order to get the balance score for
#' new datasets.
#' \item the regression model object where the covariates and the final balance
#' are the explanatory variables and \code{y} the response variable.
#' \item the optimal number of variables estimated in the cross -
#' validation.
#'
#' }
#'
#' @examples
#' # Load data set
#' load("HIV.rda")
#' # Define x and y
#' x <- HIV[,1:60]
#' y <- HIV[,62]
#' # Run the algorithm
#' CV.Bal <- selbal.cv(x,y)
#' @export selbal.cv
selbal.cv <- function(x, y, n.fold = 5, n.iter = 10, seed = 31415,
covar = NULL, col = c("steelblue1", "tomato1"),
col2 = c("darkgreen", "steelblue4","tan1"),
logit.acc = "AUC", maxV = 20, zero.rep = "bayes",
opt.cri = "1se", user_numVar = NULL){
# Load package plyr
suppressMessages(library(plyr))
#------------------------------------------------------------------------------#
#----------------------------------------------------------------------------#
# STEP 0: build the necessary objects
#----------------------------------------------------------------------------#
#-----------------------------------------------#
# 0.1: Build necessary objects for the function
#-----------------------------------------------#
# Class of the response
classy <- class(y)
# Family for the glm (default: gaussian)
f.class <- "gaussian"
# numy to be y and it will be modified if y is a factor
numy <- y
# If y is a factor, compute the number of levels of y
if (classy == "factor"){
ylev <- levels(y)
numy <- as.numeric(y) - 1
f.class <- "binomial"
}
# Names of x
x.nam <- colnames(x)
# ROB.TAB
ROB.TAB <- matrix(0, nrow = ncol(x), ncol=3)
colnames(ROB.TAB) <- c("Prop. Included", "Prop_Numerator",
"Prop_Denominator")
row.names(ROB.TAB) <- x.nam
# BAL.resume
BAL.resume <- matrix(0,nrow =n.fold*n.iter , ncol = ncol(x))
colnames(BAL.resume) <- colnames(x)
# Build a table with the response variable and covariates for correction
if (!is.null(covar)){ dat <- data.frame(cbind(numy, covar))
} else { dat <-data.frame(numy)}
# Message starting the algorithm
cat(paste("\n\n###############################################################",
"\n STARTING selbal.cv FUNCTION",
"\n###############################################################"))
# Log-transformed counts (with zero replacement
cat(paste(
"\n\n#-------------------------------------------------------------#",
"\n# ZERO REPLACEMENT . . .\n\n"))
# Define log-transformed data with the zero-replacement made
logc <- log(cmultRepl2(x, zero.rep = zero.rep))
cat(paste("\n, . . . FINISHED.",
"\n#-------------------------------------------------------------#"))
#---------------------------------------#
# 0.2: Define cross - validation groups
#---------------------------------------#
# CROSS - VALIDATION groups
# Load library
suppressMessages(library(CMA))
# Fix a seed
set.seed(seed)
# Matrix where rows represent the individuals taking part for the
# learningsets
CV.groups<-GenerateLearningsets(y = y, fold = n.fold, niter = n.iter,
method = "CV",
strat = ifelse(class(y)!="factor",
F,T))@learnmatrix
# Unload CMA package not to have issues with pROC package
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# Function for cross - validation
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
cv.MSE <- function(k){
#-------------------------------------#
# Necessary matrices for the function
#-------------------------------------#
# Folds associated to a "complete FOLD"
CV.group<-CV.groups[((k-1)*n.fold + 1):(k*n.fold),]
# Build objects
# Table with all the variables included
Bal.List <- list()
# Matrix for MSE values (or ACC "Accuracy")
ACC.Mat <- matrix(0,nrow = maxV-1, ncol = nrow(CV.group))
# For each fold
for (i in 1:nrow(CV.group)){
# Define training data.set
train.idx<-CV.group[i,]
# Training dataset (x, y and covar)
x.train<-logc[train.idx,]
x.test<-logc[-train.idx,]
y.train<-y[train.idx]
y.test<-y[-train.idx]
covar.train<-covar[train.idx,]
covar.test<-covar[-train.idx,]
# Compute the balances for the training data set
BAL <- selbal.aux(x.train, y.train, th.imp = 0, covar = covar.train,
logit.acc, logt=F, maxV = maxV)
# Variables included in the balance (as NUMERATOR | DENOMINATOR)
Bal.List[[i]]<-BAL
# A matrix for predictions for Y
PRED.y <- matrix(0, nrow = length(y.test), ncol = nrow(BAL) - 1)
# For each number of variables (2:nrow(BAL))
for (l in 2:min(maxV,nrow(BAL))){
# Data frame for train data
df <-data.frame(Y = y.train, B = bal.value(BAL[1:l,],x.train))
# Data frame for test data
df.test <- data.frame(Y = y.test, B = bal.value(BAL[1:l,],x.test))
if(!is.null(covar)){
df <- cbind(df,cov=covar.train)
df.test <- cbind(df.test,cov=covar.test)
}
# Regression model for test data
FIT <- glm(Y ~ ., data= df, family = f.class)
# Predictions
PRED.y[,l-1] <- predict(FIT,df.test, type="response")
}
#------------------------------------------------------------------------------#
# FUNCTION: Measure the error value
#------------------------------------------------------------------------------#
ACC.eval <- function(y, pred, classy, logit.acc=NULL){
if (classy == "numeric"){
ACC <- apply(pred, 2, function(x) mean((y-x)^2))
}else{
if (logit.acc == "AUC"){
# Load library
library(pROC)
ACC <- apply(pred, 2, function(x) auc(y,x, quiet=TRUE))
} else if(logit.acc == "Rsq"){
ACC <- apply(pred, 2, function(x) cor (y, x)^2)
} else if (logit.acc == "Tjur"){
ACC <- apply(pred, 2, function(x) mean(x[y==1]) - mean(x[y==0]))
} else if (logit.acc == "Dev"){
ACC<-apply(pred, 2, function(x) 1-(deviance(glm(y ~ x, data= df, family = binomial()))/glm(y~1, family=binomial())[[10]]) ) # proportion of explained deviance
}
}
return(ACC)
}
#------------------------------------------------------------------------------#
# Run ACC.eval function
R <- ACC.eval(numy[-train.idx], PRED.y, classy=classy, logit.acc)
# Add the information
ACC.Mat[,i] <- c(R, rep(R[length(R)], maxV-length(R)-1))
} # End of i
return(list(Bal.List, ACC.Mat))
}
#------------------------------------------------------------------------------#
################################################################################
cat(paste("\n\n#-------------------------------------------------------------#",
"\n# Starting the cross - validation procedure . . ."))
# Build a parallelization scenario
suppressMessages(library(foreach))
suppressMessages(library(doParallel))
# Number of cores of the computer but one
no_cores <- detectCores() - 2
# Register the number of cores
registerDoParallel(no_cores)
# Define the function comb
comb <- function(x, ...) {
lapply(seq_along(x),
function(i) c(x[[i]], lapply(list(...), function(y) y[[i]])))
}
################################################################################
# CV - procedure computed in parallel
INTEREST <- foreach(h=1:n.iter,
.export=c("logit.cor", "rowM","selbal.aux", "bal.value",
"logit.acc", "cmultRepl","cmultRepl2"),
.combine='comb',
.multicombine=TRUE,
.init=list(list(), list())) %dopar% {
cv.MSE(h)
}
# Stop the parallelization
stopImplicitCluster()
cat(paste("\n . . . finished.",
"\n#-------------------------------------------------------------#",
"\n###############################################################"))
#------------------------------------------------------------------------------#
# Rebuild the objects from INTEREST
Balances <- unlist(INTEREST[[1]], recursive = F)
ACC.Matrix <- do.call(cbind,INTEREST[[2]])
# ACC mean values for each number of variables
ACC.mean <- apply(ACC.Matrix,1,mean)
ACC.se <- apply(ACC.Matrix,1,function(x) sd(x)/sqrt(length(x)))
if(classy == "numeric"){
# Define the minimum mean value
m <- which.min(ACC.mean)
# The minimum value under ACC.mean[m] + SE
if(length(which((ACC.mean<(ACC.mean[m] + ACC.se[m]))==T))>0){
mv <- min(which((ACC.mean<(ACC.mean[m] + ACC.se[m]))==T))
} else {mv<-m}
# Depending on "opt.cri":
if (opt.cri == "1se"){opt.M <- mv + 1
}else {opt.M <- m + 1}
}else{
# Define the maximum ACC value
m <- which.max(ACC.mean)
# The minimum value whith ACC.mean over ACC.mean[m] - ACC.sd[m]
if(length(which((ACC.mean>(ACC.mean[m] - ACC.se[m]))==T))>0){
mv <- min(which((ACC.mean>(ACC.mean[m] - ACC.se[m]))==T))
} else {mv<-m}
# Depending on "opt.cri":
if (opt.cri == "1se"){opt.M <- mv + 1
}else { opt.M <- m + 1}
}
# Print a message indicating the number of optimal variables
cat(paste("\n\n The optimal number of variables is:", opt.M, "\n\n"))
if (!is.null(user_numVar)){
opt.M <- user_numVar;
}
# Define NUM and DEN according to opt.M
suppressMessages(BAL <- selbal.aux(x, y, th.imp = 0, covar = covar,
logit.acc, logt=T, maxV = opt.M)) #diferencies logit.acc=logit.acc
# Variables in the NUMERATOR and the DENOMINATOR
NUM <- BAL[BAL[,2]=="NUM","Taxa"]
DEN <- BAL[BAL[,2]=="DEN","Taxa"]
# Information about the GLM
# Final balance (number of components)
k1 <- length(NUM); k2 <- length(DEN)
# The final Balance
FINAL.BAL <- sqrt((k1*k2)/(k1+k2))*(rowM(logc[,NUM])- rowM(logc[,DEN]))
# Auxiliar data.frame for graphical representation
U <- data.frame(dat, FINAL.BAL)
colnames(U)[ncol(U)] <- "V1"
# Regression model
FIT.final <- glm(numy~., data=U, family = f.class)
#------------------------------------------------------------------------------#
# GRAPHICAL REPRESENTATION
#------------------------------------------------------------------------------#
# Build a data.frame with the information
if(classy=="numeric"){
df.boxplot <- data.frame(mean = ACC.mean, se = ACC.se, n =2:maxV)
# Load library
library(ggplot2)
# The plot
MSE.Boxplot <- ggplot(df.boxplot, aes(x=n, y=mean)) +
geom_errorbar(aes(ymin=mean - se, ymax= mean + se),
width = 0.2, col = "gray") +
geom_vline(xintercept = opt.M, linetype = "dotted",
col = "blue") +
geom_point(color = "red", lwd=2) +
theme_bw() +
xlab("Number of variables") +
ylab("Mean-Squared Error") +
scale_x_continuous(breaks=seq(2,maxV,1)) +
theme(strip.text.x = element_text(size=12, angle=0,
face="bold",colour="white"),
strip.text.y = element_text(size=12, face="bold"),
strip.background = element_rect(colour="black",
fill="black"),
plot.title = element_text(size=20, vjust=2.25, hjust=0.5,
face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
}else{
df.boxplot <- data.frame(mean = ACC.mean, se = ACC.se, n =2:maxV)
ylabelName="Accuracy (AUC)";
if (logit.acc=="Dev"){
ylabelName="Explained Deviance";
}
# Load library
library(ggplot2)
# The plot
MSE.Boxplot <- ggplot(df.boxplot, aes(x=n, y=mean)) +
geom_errorbar(aes(ymin=mean - se, ymax= mean + se),
width = 0.2, col = "gray") +
geom_vline(xintercept = opt.M, linetype = "dotted",
col = "blue") +
geom_point(color = "red", lwd=2) +
theme_bw() +
xlab("Number of variables") +
ylab(ylabelName) +
scale_x_continuous(breaks=seq(2,maxV,1)) +
theme(strip.text.x = element_text(size=12, angle=0,
face="bold",colour="white"),
strip.text.y = element_text(size=12, face="bold"),
strip.background = element_rect(colour="black",
fill="black"),
plot.title = element_text(size=20, vjust=2.25, hjust=0.5,
face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
}
# Extract information about the variables selected in the balances
Sub.Balances <- lapply(Balances, function(x) x[1:min(nrow(x),opt.M),])
# Complete the matrix with the variables appearing
# Build the matrix
BR<-matrix(0,nrow =n.fold*n.iter , ncol = length(x.nam))
colnames(BR) <- x.nam
# Complete the row of BAL.resume
for (i in 1:length(Sub.Balances)){
BR[ i, colnames(BR) %in% Sub.Balances[[i]][Sub.Balances[[i]][,"Group"]=="NUM","Taxa"]] <- 1
BR[ i, colnames(BR) %in% Sub.Balances[[i]][Sub.Balances[[i]][,"Group"]=="DEN","Taxa"]] <- 2
}
# Complete the table
ROB.TAB[,1] <- apply(BR!=0,2,function(x) 100*mean(x))
ROB.TAB[,2] <- apply(BR==1,2,function(x) 100*mean(x))
ROB.TAB[,3] <- apply(BR==2,2,function(x) 100*mean(x))
# Variables with at least included in a balance
fil1 <- which(ROB.TAB[,1]!=0)
ord.ROB.TAB <- ROB.TAB[fil1,]
# Order by the first row
sel.ord <- order(ord.ROB.TAB[,1], decreasing = F)
ord.ROB.TAB <- ord.ROB.TAB[sel.ord,]
# Define a data.frame to plot the results
BAL.SEL.TAB <- data.frame(name = row.names(ord.ROB.TAB),
sel = ord.ROB.TAB[,1])
# Define the levels of the $name
BAL.SEL.TAB$name <- factor(BAL.SEL.TAB$name, levels = BAL.SEL.TAB$name)
# Define the color for variables in the numerator (as overall)
COLOR.BAL <- rep(col[2],nrow(BAL.SEL.TAB))
# If the variable is in the denominator modify the color:
# Variables in the denominator
vDEN <- row.names(ord.ROB.TAB)[which(ord.ROB.TAB[,"Prop_Denominator"]!=0)]
COLOR.BAL[row.names(BAL.SEL.TAB) %in% vDEN] <- col[1]
# Add COLOR.BAL to BAL.SEL.TAB
BAL.SEL.TAB$COLOR.BAL <- factor(COLOR.BAL, levels = col, labels = col)
#------------------------------------------#
# Barplot representation
#------------------------------------------#
# Load library ggplot2
suppressMessages(library(ggplot2))
# IMP.plot
IMP.plot <- ggplot(BAL.SEL.TAB, aes(x=factor(name), y=sel)) +
geom_bar(stat="identity", aes(fill = COLOR.BAL),
size=1) +
guides(size = FALSE) + # Not to show the legend of the size
scale_fill_manual(name = "Group of . . .",
values = c(col[1], col[2]),
breaks = c(col[1], col[2]),
labels=c("DEN","NUM")) +
scale_color_manual(name ="Variables \n appearing in . . .",
values = c(col2,"white"),
breaks = c(col2,"white"),
labels = c("Both", "Global", "CV" ,"NONE"),
drop=F,
guide=guide_legend(
override.aes = list(fill="gray90"))) +
ylab("% of times included in a balance") +
xlab("") + theme_bw() +
coord_flip() +
ggtitle("Cross validation in balance selection") +
theme(strip.text.x = element_text(size=12, angle=0,
face="bold",colour="white"),
strip.text.y = element_text(size=12, face="bold"),
strip.background = element_rect(colour="black",
fill="black"),
plot.title = element_text(size=20, vjust=2.25, hjust=0.5,
face = "bold"),
legend.title = element_text(face="bold"),
legend.text = element_text(face="bold"))
#-----------------------------------#
# Most repeated balances
#-----------------------------------#
# Balances' strings
BAL.str <- apply(BR,1, function(x) paste(x, collapse=""))
# Resume the information
BAL.tab <- prop.table(table(BAL.str))
# Names of appearing balances
nam.str <- names(BAL.tab)
# Values
nam.A <- t(sapply(nam.str, FUN = function(x) unlist(strsplit(x,""))))
# Variables included in the most abundant balances
INC <- apply(nam.A, 1, function(x) x.nam[x!=0])
# Variables included in the numerator of each selected balance
INC.NUM <- alply(nam.A, 1, function(x) x.nam[x==1])
# Variables included in the denominator of each selected balance
INC.DEN <- alply(nam.A, 1, function(x) x.nam[x==2])
# Variables selected
UNIQUE.VAR <- unique(c(as.vector(unlist(INC)),NUM, DEN))
# Build a data.frame to represent
RESUME.BAL <- as.data.frame(matrix(0, nrow =length(UNIQUE.VAR),
ncol = length(BAL.tab)))
row.names(RESUME.BAL) <- UNIQUE.VAR
# Put "NUM" if the variable is in the numerator of the balance
RESUME.BAL[sapply(INC.NUM, function(x) UNIQUE.VAR %in% x)] <- "NUM"
# Put "DEN" if the variable is in the denominator of the balance
RESUME.BAL[sapply(INC.DEN, function(x) UNIQUE.VAR %in% x)] <- "DEN"
# Add the relative frequency of the balances
RESUME.BAL <- rbind(RESUME.BAL, FREQ=as.numeric(BAL.tab))
# Add two new columns (one for Global Balance and another for percentages)
RESUME.BAL <- cbind(RESUME.BAL, 0, 0)
# Add the information of the global balance
RESUME.BAL[row.names(RESUME.BAL)%in%NUM,ncol(RESUME.BAL)] <- "NUM"
RESUME.BAL[row.names(RESUME.BAL)%in%DEN,ncol(RESUME.BAL)] <- "DEN"
# NEW
RESUME.BAL[-nrow(RESUME.BAL) ,ncol(RESUME.BAL) - 1] <-
ROB.TAB[row.names(RESUME.BAL)[-nrow(RESUME.BAL)],1]
# Order RESUME.BAL by FREQ
RESUME.BAL <- RESUME.BAL[,c(ncol(RESUME.BAL), ncol(RESUME.BAL)-1,
order(as.numeric(RESUME.BAL[nrow(RESUME.BAL),
-c(ncol(RESUME.BAL),
ncol(RESUME.BAL)-1)],
decreasing = T)))]
# No frequency for the Global balance and the CV.Balance
RESUME.BAL[nrow(RESUME.BAL),1:2]<- "-"
# Data to plot (maximum 5 different balances)
dat <- RESUME.BAL[,c(1,2:(min(5,ncol(RESUME.BAL))))]
W <- which(apply(dat[,-2]==0,1,mean)==1)
if(length(W)!=0){ dat <- dat[-as.numeric(W),]}
# Change the order of first and second colum
dat <- dat[,c(2,1,3:ncol(dat))]
colnames(dat)[1:2]<-c("%","Global")
# Order dat (rows ordered by their presence percentage)
dat<-dat[c(order(as.numeric(dat[-nrow(dat),1]),decreasing=T),nrow(dat)),]
#------------------------------------------------------------------------------#
# GRAPHICAL REPRESENTATION OF MAIN BALANCES: global balance and cv - balance
#------------------------------------------------------------------------------#
# Define y for the plot
ifelse(classy %in% c("numeric","integer"),
y.plot <- y,
y.plot <- 1:length(y))
# PLOT GLOBAL
PLOT.Global <- plot.bal(NUM,DEN,logc,y, covar, col = col, logit.acc)
# Message starting the algorithm
cat(paste("\n\n###############################################################",
"\n . . . FINISHED.",
"\n###############################################################"))
# Build a list with the elements of interest
L <- list(accuracy.nvar = MSE.Boxplot,
var.barplot = IMP.plot,
global.plot = PLOT.Global$Global.plot,
global.plot2 = PLOT.Global$Global.plot2,
ROC.plot = PLOT.Global$ROC.plot,
cv.tab = dat,
cv.accuracy = ACC.Matrix[(opt.M - 1),],
global.balance = BAL,
glm = FIT.final,
opt.nvar = opt.M)
return(L)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# FUNCTION: selbal.aux
# INPUT:
# x: matrix with counts
# y: response variable
# th.imp: threshold of improvement when adding a variable. If the
# improvement is not higher than this quantity, it stops.
# covar: data.frame with covariates
# logit.acc: the accuracy value to return when a logit regression is made
# logt: should data be log-transformed?
# maxV: number of maximum variables for the balance
# OUTPUT:
# Tab.var: a table including the variables included in the balance in the
# order they were added.
#------------------------------------------------------------------------------#
#' The balance defined in the order its variables were selected
#'
#'
#' @param x a \code{matrix} object with the information of variables
#' (\emph{columns}) for each sample (\emph{rows}).
#' @param y the response variable, either continuous or dichotomous.
#' @param th.imp the minimum increment needed when adding a new variable into
#' the balance, in order to consider an improvement.
#' @param covar \code{data.frame} with the variables to adjust for
#' (\emph{columns}).
#' @param logit.acc when \code{y} is dichotomous, the measure to compute for
#' the correlation between \code{y} and the proposed \emph{balance}
#' adjusting for covariates. One of the following values: \code{"Rsq"} (default),
#' \code{"AUC"} or \code{"Tjur"}.
#' @param logt a logical value indicating if the data needs to be
#' log-transformed.
#' @param maxV the number maximum of variables defining the balance.
#'
#' @return A \code{data.frame} with the following objects:
#'
#' \itemize{
#' \item \code{Taxa} the name of the taxa added into the balance.
#' \item \code{Group} \emph{NUM} if the variable is added into the numerator
#' and \emph{DEN} if it is added in the denominator.
#'
#' }
#'
#' @examples
#' # Load data set
#' load("HIV.rda")
#' # Define x and y
#' x <- HIV[,1:60]
#' y <- HIV[,62]
#' # Run the algorithm
#' Bal <- selbal.aux(x,y)
#' @export selbal.aux
selbal.aux <- function(x, y, th.imp = 0, covar = NULL, logit.acc="AUC",
logt=T, maxV = 1e10, zero.rep = "bayes"){
#--------------------------------------------------------------------------#
# STEP 0: load libraries and extract information
#--------------------------------------------------------------------------#
#----------------------------------------------#
# 0.1: information about the response variable
#----------------------------------------------#
# Class of the response variable
classy <- class(y)
# Family for the glm (default: gaussian)
f.class <- "gaussian"
# numy to be y and it will be modified if y is a factor
numy <- y
# If y is a factor, compute the number of levels of y
if (classy == "factor"){
ylev <- levels(y)
numy <- as.numeric(y) - 1
f.class <- "binomial"
}
#------------------------------------------------------------------#
# 0.2: information and transformation of the independent variables
#------------------------------------------------------------------#
# Load library
suppressMessages(library(zCompositions))
# Build a table with the response variable and covariates for correction
if (!is.null(covar)){ dat <- data.frame(cbind(numy, covar))
} else { dat <-data.frame(numy)}
# The logCounts (with zero replacement)
if (logt == F){ logCounts <- x
} else{
logCounts <- log(cmultRepl2(x, zero.rep = zero.rep))
}
# Variables name
var.nam <- rem.nam <- colnames(logCounts)
#--------------------------------------------------------------------------#
# 0.3: auxiliar functions
#--------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# Auxiliar function to compute the first balance
#--------------------------------------------------------------------------#
first.bal<- function(logCounts, Y, covar=NULL){
#------------------------------------------------------------------------#
# STEP 0: extract information
#------------------------------------------------------------------------#
# Number and name of variables
n <- ncol(logCounts)
nam <- colnames(logCounts)
#------------------------------------------------------------------------#
# STEP 1: build the output matrix
#------------------------------------------------------------------------#
# The matrix
if (classy=="factor"){ M<-matrix(0, nrow=n, ncol=n)
}else{ M<-matrix(1e99, nrow=n, ncol=n)}
# Row.names and colnames
row.names(M)<-colnames(M)<-nam
#------------------------------------------------------------------------#
# STEP 2: complete the matrix
#------------------------------------------------------------------------#
if(classy=="factor"){
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
for (i in 2:n){
for (j in 1:(i-1)){
# Build a table with the information
TAB <- data.frame(cbind(Y,logCounts[,i]-logCounts[,j],covar))
# Fit the regression model
FIT <- glm(Y ~ .,data=TAB, family = f.class)
# Add the value into the matrix
ifelse(FIT$coefficients[2]>0,
M[i,j] <- logit.cor(FIT, y = y, covar = covar, logit.acc),#diferencies Y covar
M[j,i] <- logit.cor(FIT, y = y, covar = covar, logit.acc)) #diferencies Y covar
} # End j
} # End i
# Indices for the highest logit.cor value
r <- which(M == max(M), arr.ind = T)
} else {
for (i in 2:n){
for (j in 1:(i-1)){
# Build a table with the information
TAB <- data.frame(cbind(Y,logCounts[,i]-logCounts[,j],covar))
# Fit the regression model
FIT <- glm(Y ~ .,data=TAB, family = f.class)
# Complete the matrix
ifelse(FIT$coefficients[2]>0,
M[i,j] <- mean(FIT$residuals^2),
M[j,i] <- mean(FIT$residuals^2))
} # End j
} # End i
# Indices for the lowest MSE value
r <- which(M == min(M), arr.ind = T)
}
# Return the row and column of the maximum value
return(r)
}
#--------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# Auxiliar function to compute the "association value" when adding a new
# variable into the balance
#--------------------------------------------------------------------------#
balance.adding.cor <- function(x, LogCounts, POS, NEG, numy, covar=NULL){
#----------------------------------------#
# If x added into the numerator, . .
#----------------------------------------#
# The "numerator"
S1.pos <- rowM(LogCounts[,c(POS,x)]); s1 <- length(POS) + 1
# The "denominator"
S2.pos <- rowM(LogCounts[,NEG]) ; s2 <- length(NEG)
# The balance
BAL <- sqrt((s1*s2)/(s1+s2))*(S1.pos - S2.pos)
# Data.frame with the variables
D.pos <- data.frame(cbind(numy, BAL, covar))
# Regression model
FIT.pos <- glm(numy~., data=D.pos, family=f.class)
# The MSE or the corresponding value for dichotomous responses
if(classy=="numeric"){ C.pos <- mean(FIT.pos$residuals^2)
#}else{ C.pos <- logit.cor(FIT.pos,numy,covar = covar, logit.acc)}#diferencies covar
}else{ C.pos <- logit.cor(FIT.pos,y,covar = covar, logit.acc)}#diferencies covar
#----------------------------------------#
# If x added into the numerator, . .
#----------------------------------------#
# The numerator
S1.neg <- rowM(LogCounts[,POS]) ; s1 <- length(POS)
# The denominator
S2.neg <- rowM(LogCounts[,c(NEG,x)]) ; s2 <- length(NEG) + 1
# The balance
BAL <- sqrt((s1*s2)/(s1+s2))*(S1.neg - S2.neg)
# Data.frame with the variables
D.neg <- data.frame(cbind(numy, BAL, covar))
# Regression model
FIT.neg <- glm(numy~., data=D.neg, family=f.class)
# The MSE or the corresponding value for dichotomous responses
if(classy=="numeric"){ C.neg <- mean(FIT.neg$residuals^2)
#}else{ C.neg <- logit.cor(FIT.neg,numy,covar = covar, logit.acc)} #diferencies covar
}else{ C.neg <- logit.cor(FIT.neg,y,covar = covar, logit.acc)} #diferencies covar
# Correlation values
COR <- c(C.pos, C.neg)
# Return the values
return(COR)
}
#------------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# STEP 1: depending on the response variable class, . . .
#--------------------------------------------------------------------------#
# Define the first balance
A1 <- first.bal(logCounts, Y = numy, covar=covar)
# Variables taking part into the first balance
POS <- colnames(logCounts)[A1[1,1]]
NEG <- colnames(logCounts)[A1[1,2]]
# Included variables in the model
INC.VAR <- c(POS, NEG)
# Delete these variables from rem.nam
rem.nam <- setdiff(rem.nam, INC.VAR)
# Define the initial balance (B_1)
S1 <- logCounts[,POS]
S2 <- logCounts[,NEG]
# Assign the values to B
B <- sqrt(1/2)*(S1 - S2)
#--------------------------------------------------------------------------#
# Information about the ACC for the Balance values
#--------------------------------------------------------------------------#
#--------------------------------------------------------------------------#
# NEW: A table with the included variables and the group
#--------------------------------------------------------------------------#
Tab.var<- data.frame(Taxa = c(POS,NEG), Group = c("NUM", "DEN"))
Tab.var[,1]<-as.character(Tab.var[,1])
# Build a new data.frame
dat.ini <- cbind(dat, B)
# Fit the regression model
FIT.initial <- glm(numy ~ .,data=dat.ini, family = f.class)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Define the initial "accuracy" or "association" value
if(classy=="numeric"){ ACC.Bal <- mean(FIT.initial$residuals^2)
#}else{ ACC.Bal <- logit.cor(FIT.initial, numy, covar = covar, logit.acc)}#diferencies covar
}else{ ACC.Bal <- logit.cor(FIT.initial, y, covar = covar, logit.acc)}#diferencies covar
#----------------------------------------------------------------------------#
# ACC reference
ACC.ref <- ACC.Bal
#------------------------------------------------------------------------#
# Improve the balances
#------------------------------------------------------------------------#
# Define some parameters
# The p.value to compare 2 balances (one of them with an additional
# variable)
ACC.set <- ACC.ref
# Index of the number of variables for the balance
nV <- 2
#------------------------------#
# For numeric responses, . . .
#------------------------------#
if (classy=="numeric"){
# While there is an improvement and the maximun number of variables has not
# been reached, . . .
while (ACC.set <= ACC.ref && length(rem.nam)!=0 && nV<maxV){
# The new p.bal.ref is the p.set of the previous step
ACC.ref <- ACC.set
# Function to extract the p-value
add2bal.ACC <- matrix(,nrow = 0, ncol = 2)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Extract the p-values
add2bal.ACC <- t(sapply(rem.nam, function(x)
balance.adding.cor(x, LogCounts = logCounts, POS, NEG, numy = numy,
covar = covar)))
# Add names to the rows
row.names(add2bal.ACC) <- rem.nam
# Extract which is the variable (only the first row)
ACC.opt <- which(add2bal.ACC==min(add2bal.ACC),arr.ind = T)[1,]
# Modify p.set
ACC.set <- min(add2bal.ACC)
# If there is an improvement, . . .
#if (abs(ACC.set - ACC.ref) > th.imp){
if ((ACC.set - ACC.ref) < th.imp){
INC.VAR <- c(INC.VAR, rem.nam[ACC.opt[1]])
nV <- nV + 1
if (ACC.opt[2]==1){
POS <- c(POS, rem.nam[ACC.opt[1]])
Tab.var <- rbind(Tab.var, c(rem.nam[ACC.opt[1]], "NUM"))
} else if (ACC.opt[2]==2){
NEG <- c(NEG, rem.nam[ACC.opt[1]])
Tab.var <- rbind(Tab.var, c(rem.nam[ACC.opt[1]], "DEN"))
} else {ACC.set <- 0 }
# Remainng variables (possible to add to the balance)
rem.nam <- rem.nam[-ACC.opt[1]]
}
} # End while
}else{
#-----------------------------------#
# For non-numeric responses, . . .
#-----------------------------------#
# While there is an improvement and the maximun number of variables has not
# been reached, . . .
while (ACC.set >= ACC.ref && length(rem.nam)!=0 && nV<maxV){
# The new p.bal.ref is the p.set of the previous step
ACC.ref <- ACC.set
# Function to extract the p-value
add2bal.ACC <- matrix(,nrow = 0, ncol = 2)
# Solve the problem with libraries
suppressWarnings(suppressMessages(library("CMA")))
suppressMessages(detach("package:CMA", unload=TRUE))
suppressMessages(library(pROC))
# Extract the p-values
add2bal.ACC <- t(sapply(rem.nam, function(x)
balance.adding.cor(x, LogCounts = logCounts, POS, NEG, numy = numy,
covar = covar)))
# Add names to the rows
row.names(add2bal.ACC) <- rem.nam
# Extract which is the variable (only the first row)
ACC.opt <- which(add2bal.ACC==max(add2bal.ACC),arr.ind = T)[1,]
# Modify p.set
ACC.set <- max(add2bal.ACC)
# If there is an improvement, . . .
if ((ACC.set - ACC.ref) > th.imp){
INC.VAR <- c(INC.VAR, rem.nam[ACC.opt[1]])
nV <- nV + 1
if (ACC.opt[2]==1){
POS <- c(POS, rem.nam[ACC.opt[1]])
Tab.var <- rbind(Tab.var, c(rem.nam[ACC.opt[1]], "NUM"))
} else if (ACC.opt[2]==2){
NEG <- c(NEG, rem.nam[ACC.opt[1]])
Tab.var <- rbind(Tab.var, c(rem.nam[ACC.opt[1]], "DEN"))
} else {ACC.set <- 0 }
} # End if
# Remainng variables (possible to add to the balance)
rem.nam <- rem.nam[-ACC.opt[1]]
}
}
# K1 and k2
k1 <- length(POS); k2 <- length(NEG)
# The final Balance
FINAL.BAL <- sqrt((k1*k2)/(k1+k2))*
(rowM(logCounts[,POS])- rowM(logCounts[,NEG]))
return(Tab.var)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# FUNCTION: bal.value
# INPUT:
#
# bal.tab: a table defining a balance as in the output of selbal.aux
# x: matrix with counts
# OUTPUT:
# Tab.var: a table including the variables included in the balance in the
# order they were added.
#------------------------------------------------------------------------------#
#' The balance value for a given matrix of counts using the variables appearing
#' in a \code{data.frame} like the output of \code{selbal.aux}
#'
#'
#' @param bal.tab a \code{data.frame} including the variables defining the
#' balance (like the output of \code{selbal.aux} or the sixth element in the
#' output of \code{selbal.cv} function).
#' @param x the matrix with the log-transformed counts for a given subset of
#' individuals.
#'
#' @return A \code{vector} with the balance values for each subject.
#'
#'
#' @examples
#' # Load data set
#' load("HIV.rda")
#' # Define x and y
#' x <- HIV[,1:60]
#' y <- HIV[,62]
#' # Run the algorithm
#' Bal <- selbal.aux(x,y)
#' # Balance values for the individuals (log-transformed x values with the corresponding zero-replacement)
#' bal.value(Bal,log(cmultrepl2(x)))
#'
#' @export bal.value
bal.value <- function(bal.tab, x){
# Variables in the numerator and the denominator
vNUM <- bal.tab[bal.tab[,2]=="NUM",1]; k1 <- length(vNUM)
vDEN <- bal.tab[bal.tab[,2]=="DEN",1]; k2 <- length(vDEN)
# Value for the balance
bal <- sqrt(k1*k2/(k1+k2))*(rowM(x[,vNUM]) - rowM(x[,vDEN]))
# Return the value
return(bal)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# FUNCTION: plot.bal
# INPUT:
#
# POS: variables taking part in the numerator of the balance
# NEG: variables taking part in the denominator of the balance
# x: matrix with counts
# y: response variable
# col: vector with two colours, each one for a level when dichotomous
# factor.
# OUTPUT:
# A graphical representation of the given balance with the corresponding
# information.
#
#------------------------------------------------------------------------------#
#' The balance value for a given matrix of counts using the variables appearing
#' in a data.frame like the output of selbal.aux
#'
#'
#' @param POS a vector containing the variables taking part in the numerator of
#' the balance.
#' @param NEG a vector containing the variables taking part in the denominator
#' of the balance.
#' @param x a \code{matrix} object with the log-transforemd counts
#' @param y the response variable, either continuous or dichotomous.
#' @param covar a \code{data.frame} with the covariates to adjust for.
#' @param col a vector with two values for representing the individuals for
#' each of the two considered groups.
#' @param logit.acc when \code{y} is dichotomous, the measure to compute for
#' the correlation between \code{y} and the proposed \emph{balance}
#' adjusting for covariates. One of the following values: \code{"AUC"} (default),
#' \code{"Dev"}, \code{"Rsq"} or \code{"Tjur"}.
#'
#' @return A graphical representation of the balance for the selected
#' individuals and the variables taking part on it.
#'
#'
#' @examples
#' # Load data set
#' load("HIV.rda")
#' # Define x and y
#' x <- HIV[,1:60]
#' y <- HIV[,61]
#' # Run the algorithm
#' Bal <- selbal.aux(x,y)
#' # Balance values for the individuals
#' POS <- Bal[Bal[,2]=="NUM",1]
#' NEG <- Bal[Bal[,2]=="DEN",1]
#' # Log-transformed x for the representation
#' logx <- log(cmultRepl(x))
#' A <- plot.bal(POS,NEG,logx,y,classy="factor",col=c("red","blue"),
#' logit.acc="AUC)
#' # The plot
#' grid.draw(A)
#'
#' @export plot.bal
plot.bal <- function(POS, NEG, x, y, covar=NULL, col, logit.acc=NULL){
# Class of the response
classy <- class(y)
# Family for the glm (default: gaussian)
f.class <- "gaussian"
# numy to be y and it will be modified if y is a factor
numy <- y
# If y is a factor, compute the number of levels of y
if (classy == "factor"){
ylev <- levels(y)
numy <- as.numeric(y) - 1
f.class <- "binomial"
}
#--------------------------------------------------------------------------#
# STEP 1: represent the "tree" with the variables included
#--------------------------------------------------------------------------#
# Variables included
T1 <- c("NUMERATOR", POS)
T2 <- c("DENOMINATOR",NEG)
# Parameter to specify the limits for writting
yl <- max(length(T1), length(T2)) + .5
# Empty plot 1 with text
df <- data.frame()
Imp.table <- ggplot(df) + xlim(0, 100) + ylim(-0.5, yl) + theme_void() +
annotate("text",
x = 75,
y = floor(yl):ceiling(yl-length(T1)),
label = T1,
colour = c("royalblue1",rep("black",length(T1)-1)),
fontface = 2) +
annotate("text",
x = 25,
y = floor(yl):ceiling(yl-length(T2)),
label = T2,
colour = c("royalblue1",rep("black",length(T2)-1)),
fontface = 2)
# Parameter 2 to specify the limits for writting
T1 <- c(POS,"A")
T2 <- c(NEG,"B")
yl2 <- max(length(T1), length(T2));
yl2 <- yl2*1.05;
escal <- 3;
bot <- 5*escal*yl2/100;
# Empty plot 2 with text
ndiv = max(3,floor(yl2))+1;
lineh <- 0 # 0.5*ceiling(yl-length(T1));
df2 <- data.frame()
colbalance<-"brown3"
Imp.table2 <- ggplot(df2) + xlim(0, 100) + ylim(-bot, ndiv) + theme_void() +
geom_segment(aes(x = 10, y = lineh, xend = 90, yend = lineh),color=colbalance, size=1.3) +
geom_segment(aes(x = 50-escal, y = lineh-bot, xend = 50+escal, yend = lineh-bot),color=colbalance, size=1) +
geom_segment(aes(x = 50+escal, y = lineh-bot, xend = 50+escal, yend = lineh),color=colbalance, size=1) +
geom_segment(aes(x = 50-escal, y = lineh-bot, xend = 50-escal, yend = lineh),color=colbalance, size=1) +
annotate("text",
x = 75,
y = seq(bot, floor(yl2), length.out = ndiv),
label = c(T1,rep("",ndiv-length(T1))),
colour = c(rep("black",length(T1)-1),rep(colbalance,ndiv-length(T1)+1)),
fontface = 2) +
annotate("text",
x = 25,
y = seq(bot, floor(yl2), length.out = ndiv),
label = c(T2,rep("",ndiv-length(T2))),
colour = c(rep("black",length(T2)-1),rep(colbalance,ndiv-length(T2)+1)),
fontface = 2)
#--------------------------------------------------------------------------#
# STEP 2: the rest of the plots depending on the type of variable
#--------------------------------------------------------------------------#
# Specify FINAL.BAL
l1 <- length(POS); l2 <- length(NEG)
Coef <- sqrt((l1*l2)/(l1+l2))
# The final balance
FINAL.BAL <- Coef*(rowM(x[,POS]) - rowM(x[,NEG]))
# Auxiliar data.frame for graphical representation
U <- as.data.frame(cbind(numy, covar,FINAL.BAL))
# Factor
if(classy=="factor"){U$numy <- factor(U$numy, labels = ylev)}
colnames(U) <- c("numy", colnames(covar),"V1")
# Final regression model
FIT.final <- glm(numy~., data=U, family = f.class)
# Maximum and minimum values for y and V1 (when numeric)
if (class(y)%in%c("numeric","integer")){
a1<-min(U$V1); a2 <- max(U$V1)
b1<-min(U$numy); b2 <- max(U$numy)
}
# The plot depending of the class of the response variable
if (classy=="factor"){
#---------------------------------------------------#
# The composition of the final plot
#---------------------------------------------------#
# BOXPLOT 1
BoxP <- ggplot(U, aes(x=numy, y=V1, fill=y)) +
geom_boxplot(color="black", size=1) +
scale_fill_manual(values=col) +
theme_bw() +
ylab("Balance") +
xlab("Factor") +
theme(legend.position = "none")
# BOXPLOT 2
BoxP2 <- ggplot(U, aes(x=y, y=V1, fill=y)) +
geom_boxplot(color="black", size=1) +
scale_fill_manual(values=col) +
theme_bw() +
ylab("Balance") +
xlab("") +
theme(legend.position = "none")+coord_flip()
# Density plot 1 for the balance
ydensity <- ggplot(U, aes(V1, fill=y)) +
geom_density(alpha=.5, size=1.25) +
scale_fill_manual(values = col) +
theme_bw() + xlab("") + ylab("") +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
coord_flip()
# Density plot 2 for the balance
ydensity2 <- ggplot(U, aes(V1, fill=y)) +
geom_density(alpha=.5, size=1.) +
scale_fill_manual(values = col) +
theme_bw() + xlab("") + ylab("") +
theme(legend.position = "none")
# ROC - curve
library(pROC)
# Build ROC curve
A<-roc(response = U$numy,predictor = FIT.final$fitted.values, quiet = TRUE)
# Extract the sensitivity and specificity value
ROC.TAB <- data.frame(x=1-A$specificities, y=A$sensitivities)
# Order them for a correct representation
ROC.TAB <- ROC.TAB[order(ROC.TAB$y),]
# AUC value
#auc.val <- round(logit.cor(FIT.final,y = U$numy, logit.acc = logit.acc),3)
#auc.val<-round(as.numeric(auc(y, FIT.final$fitted.values, quiet=TRUE)),3) #diferencies
if (class(y) == "factor") {numy<-as.numeric(y)-1} else {numy<-y} #new diferences
auc.val<-round(as.numeric(auc(numy, FIT.final$fitted.values, quiet=TRUE)),3) #new diferencies
# The plot
ROC.plot <- ggplot(data=ROC.TAB, aes(x=x, y=y)) +
geom_line() +
ggtitle("ROC curve") +
xlab("FPR") + ylab("TPR") +
geom_step() +
annotate("text", x = .5, y = .2,
label = paste("AUC-ROC \n",auc.val), size=3) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
# Load libraries
library("gridExtra")
library("grid")
FINAL.P <- arrangeGrob(Imp.table, ROC.plot, BoxP, ydensity,
ncol=2, nrow=2, widths=c(5,1), heights=c(2, 5),
vp=viewport(width=0.8, height=0.8))
library(gtable)
g1 <- ggplotGrob(Imp.table2)
g2 <- ggplotGrob(BoxP2)
g3 <- ggplotGrob(ydensity2)
g <- rbind(g1, g2, size = "first")
g <- rbind(g, g3, size = "first")
g$widths <- unit.pmax(g1$widths,g2$widths)
FINAL.P2 <- g
# Build a list with the elements of interest
L <- list(Global.plot = FINAL.P,Global.plot2 = FINAL.P2, ROC.plot = ROC.plot)
} else {
# Coefficient of determination
C.Det <- cor(FIT.final$fitted.values, y)^2
PLOT.G <- ggplot(U, aes(V1, y)) +
geom_point(colour = "black", size = 3) +
geom_abline(intercept=FIT.final$coefficients[1],
slope=FIT.final$coefficients[length(FIT.final$coefficients)], lwd=3, col="blue") +
theme_bw() +
xlab("Balance value") + ylab("Response variable") +
ggtitle("") +
annotate("text", x = a2 - .2*(a2-a1), y = b2 - .1*(b2-b1),
label = paste("italic(R) ^ 2 ==", round(C.Det,3)),
parse = TRUE) +
theme(strip.text.x = element_text(size=12, angle=0,
face="bold",colour="white"),
strip.text.y = element_text(size=12, face="bold"),
strip.background = element_rect(colour="black",
fill="black"),
plot.title = element_text(size=20, vjust=2.25, hjust= .5,
face = "bold"),
legend.title = element_text(face="bold"),
legend.text = element_text(face="bold"))
# Load libraries
library(grid)
library(gridExtra)
FINAL.P <- arrangeGrob(Imp.table, PLOT.G, nrow=2,
heights=c(0.2,0.5),vp=viewport(width=0.8,
height=0.8))
FINAL.P2 <- arrangeGrob(Imp.table2, PLOT.G, nrow=2,
heights=c(1,2),vp=viewport(width=0.8,
height=0.8))
# Build a list with the elements of interest
L <- list(Global.plot = FINAL.P,Global.plot2 = FINAL.P2)
}
return(L)
# return(FINAL.P)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# AUXILIARY FUNCTIONS
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# NAME: rename_OTU
# FUNCTION: it renames the rows of an phyloseq object not to have repeated
# names. The idea is to write at genus level for example,
# f_Lactobacillales_g_unclassified
# INPUT: a "phyloseq" object with @tax_table information and the rank
# given as a number of the column of @tax_table.
# OUTPUT: the "taxonomyTable" object with non-repeated modified names.
#------------------------------------------------------------------------------#
#' Rename each taxon
#'
#' \code{rename_OTU} assigns a non - repeated name to each bacteria for a given
#' taxonomic level.
#'
#'
#' @param phy the phyloseq object where the information is contained.
#' @param rank a \code{numeric} value indicating the taxonomic level where the
#' names should be taken. It corresponds with the column number of
#' \code{phy@tax_table} associated to the desired taxonomic rank.
#' @param db the bacterial database used to name each OTU of the phyloseq
#' object: SILVA (\emph{default}) or GreenGenes.
#'
#'
#'
#' @return A vector with the names for each bacteria. It has not repeated names
#' and none of them starts with \emph{Incertae_Sedis} or \emph{unclassified}.
#'
#' @export rename_OTU
rename_OTU <- function(phy,rank, db ="SILVA"){
# Test if the objects are from the expected class
stopifnot(class(phy) == "phyloseq")
stopifnot(class(rank) == "numeric")
# Rank abbreviation (Kingdom, Phylum, Class, Order, Family, Genus, Specie)
Ranking<-c("k","p","c","o","f","g","s")
################################################################################
# AUXILIAR FUNCTION
################################################################################
replace_rare <- function(j, rk, tax_table, Nam, Ranking, db){
# The column of tax_table we are working on
u <- rk[j]
# The first previous column which is not "unclassified"
# If the name is unclassified . . .
if (tax_table[j,u]=="unclassified"){
while(tax_table[j,u] %in% c("unclassified", "Incertae_Sedis")){
u <- u - 1}
# Modify Nam
if (db=="SILVA"){
V<-paste(Ranking[u], tax_table[j,u],
paste(unlist(strsplit(Nam[j],split="_"))[-c(1:2)],
collapse = "_"),
sep="_")
} else {
v<-paste(tax_table[j,u],
paste(unlist(strsplit(Nam[j],split="_"))[-c(1:2)],
collapse = "_"),
sep="_")}
# . . . else if the name is Incertae_Sedis
} else if (tax_table[j,u]=="Incertae_Sedis"){
while(tax_table[j,u] %in% c("unclassified", "Incertae_Sedis")){
u <- u - 1}
if (db=="SILVA"){
# Modify Nam
V<-paste(Ranking[u], tax_table[j,u],
paste(unlist(strsplit(Nam[j],split="_")), collapse = "_"),
sep="_")
} else {
V<-paste(tax_table[j,u],
paste(unlist(strsplit(Nam[j],split="_")), collapse = "_"),
sep="_")
}
}
# Return V
return(V)
}
################################################################################
# Vector with initial names
Nam<-phy@tax_table[,rank]
if (db=="SILVA"){Nam <- paste(Ranking[rank],Nam,sep="_")}
# Make a copy to work with (Nam2)
Nam2<-Nam
# Initial r value (counting the maximum repeated values for a certain name)
r<-2
# Initial value for the number of categories used
i<-1
# A vector with the rank of the name (initially rank value)
rk <- rep(rank, length(Nam))
# While r>1 (while there are repeated names)
while (r>1){
# Repeated names
Rep.Nam<-names(table(Nam)[(table(Nam)>1)])
# Indices with repeated names
Rep.Idx<-which(Nam %in% Rep.Nam)
# Modify rk
rk[Rep.Idx]<- rk[Rep.Idx] - 1
# Modify Nam for Rep.Idx if it is not null
if (length(Rep.Idx)!=0){
if(db=="SILVA"){
Nam[Rep.Idx] <- paste(Ranking[rank-i],phy@tax_table[Rep.Idx,rank-i],
Nam2[Rep.Idx],sep="_")
}else{
Nam[Rep.Idx] <- paste(phy@tax_table[Rep.Idx,rank-i],
Nam2[Rep.Idx],sep="_")
}
}
# Modify r as the number of the maximum repeated name
r<-max(table(Nam))
i<-i+1
}
# Load library
library(qdapRegex)
# Extract the first value for each name
First.NAM <- unlist(lapply(Nam, function(x)
unlist(rm_between(x, "_","_", extract = T))[1]))
# Indices for the names to replace (with the first name as unclassified
# or Incertae_Sedis)
IDX.Rep <- which(First.NAM %in% c("unclassified", "Incertae"))
# If there are unclassified, . . .
if (length(IDX.Rep) !=0){
for (i in 1:length(IDX.Rep)){
Nam[IDX.Rep[i]] <- replace_rare(IDX.Rep[i],
rk,
phy@tax_table,
Nam,
Ranking,
db)
}
}
return(Nam)
}
################################################################################
#------------------------------------------------------------------------------#
# Auxiliar function in order to replace zeros if necesary
#------------------------------------------------------------------------------#
#' Zero replacement for compositional data
#'
#' \code{cmultRepl2} replaces the zeros for a matrix where each row is
#' compositional
#'
#'
#' @param x a \code{matrix} object with the information of variables
#' (\emph{columns}) for each sample (\emph{rows}).
#' @param zero.rep if \emph{"bayes"} the Bayesian - Multiplicative treatment
#' implemented in \code{zCompositions} is applied. If \emph{"one"}, a
#' pseudocount of 1 is added to the whole matrix.
#'
#'
#'
#' @return The initial matrix after the zero - replacement normalized so that
#' each sample's composition sums one.
#'
#'
#' @examples
#'
#' # Load the count matrix (with zeros)
#' x <- HIV[,1:60]
#' # Zero replacement
#' x.non0 <- cmultRepl2(x, zero.rep = "bayes")
#'
#'
#' @export cmultRepl2
cmultRepl2 <- function(x, zero.rep = "bayes"){
# Load library
library(zCompositions)
# If there are zeros, use cmultRepl
if(sum(x==0)!=0){
if (zero.rep =="bayes"){
new.x <- cmultRepl(x, suppress.print = T)
} else if (zero.rep =="one"){
new.x <- x + 1
}
}else { new.x <- x}
# Return new.x
return(new.x)
}
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# NAME: plot.tab
# INPUT:
# dat: the table to represent
# col: vector with two colors, each of them referring to the variables
# appearing in the numerator and in the denominator respectively.
# OUTPUT: a plot with the information given in dat (for the first five columns)
#------------------------------------------------------------------------------#
#' Plots the cross - validated summary table
#'
#'
#'
#' @param dat the \code{data.frame} object to draw obtained from
#' \code{selbal.cv} function.
#' @param col vector with two colors, each of them referring to the variables
#' appearing in the numerator and in the denominator, respectively.
#'
#'
#' @return A colored table with the information of the given \code{data.frame}:
#'
#' \itemize{
#' \item The first column of the table (%) represents the proportion of times
#' a variables has been selected for a balance.
#'
#' \item The second column (\emph{Global}) shows the result for the balance
#' obtained using all the available samples.
#'
#' \item The last three columns represent the most repeated balances in the
#' cross - validation procedure.
#'
#' }
#'
#' @examples
#' #---------------------------------------------------#
#' # 1) Compute a cross - validated balance selection
#' #---------------------------------------------------#
#' # Load data set
#' load("HIV.rda")
#' # Define x and y
#' x <- HIV[,1:60]
#' y <- HIV[,62]
#' # Run the algorithm
#' CV.Bal <- selbal.cv(x,y)
#'
#' #----------------------------------------#
#' # 2) Plot the table
#' #----------------------------------------#
#'
#' plot.tab(CV.Bal[[4]])
#'
#' @export plot.tab
plot.tab <- function(dat, col = c("steelblue1","tomato1")){
# Load library
library(gtable)
# Data.frame dimension
dim.dat<-dim(dat)
# Colnames dat
colnames(dat)<- c("%","Global",paste("BAL", 1:(dim.dat[2]-2), sep=" "))
# Modify dat
dat.l<-data.frame(cbind(c(" ",row.names(dat)),rbind(colnames(dat),dat)))
row.names(dat.l)<-colnames(dat.l)<-NULL
# Build my.data2 with the row.names(my.data) as a first column
dat2<-dat.l
dat2<-apply(dat2,2,function(x) as.character(x))
# Extract some information
nc <- ncol(dat2)
nr <- nrow(dat2)
n <- nc*nr
# Change the values for the background colour
# Legend (NUM = 10, DEN = 9, OTHER <- 0, HEADER/ROW.NAME <- 1)
dat2[dat2=="NUM"] <- 10
dat2[dat2=="DEN"] <- 9
dat2[,1]<-dat2[1,] <- 1
dat2[-1,2]<- 0
dat2[nr,]<- 0
# Information but the last linefor colors
Letra <- as.character(factor(dat2, labels=c("black", "gray15", col[2],
col[1])))
# Delete words from dat, only selecting the numbers
dat1<-dat.l
dat1[-c(1,nrow(dat1)),-c(1,2)]<-" "
dat1[is.na(dat1)]<-" "
# Filling colors
fill <- as.character(factor(t(dat2),labels=c("white", "gray85", col[2],
col[1])))
# Define the background of cells
fill <- lapply(seq_len(n), function(ii) rectGrob(gp=gpar(fill=fill[ii])))
# Some calculations for cell sizes
row_heights <- function(m){
do.call(unit.c, apply(m, 1, function(l)
max(do.call(unit.c, lapply(l, grobHeight)))))
}
col_widths <- function(m){
do.call(unit.c, apply(m, 2, function(l)
max(do.call(unit.c, lapply(l, grobWidth)))))
}
# Object as matrix
label_matrix <- as.matrix(dat1)
nc <- ncol(label_matrix)
nr <- nrow(label_matrix)
n <- nc*nr
# Text written in the table
# Auxiliar values for text justification
pos <- rep(c(0.5, rep(0.96,nr-2),0.5),nc)
jus <- rep(c(0.5, rep(1,nr-2),0.5),nc)
# Third column centered
pos[(nr +1):(2*nr)] <- jus[(nr+1):(2*nr)] <- 0.5
# Define the text characteristics
labels <- lapply(seq_len(n), function(ii)
textGrob(label_matrix[ii],x=pos[ii],
just=jus[ii],
gp=gpar(fontface="bold",col=Letra[ii])))
label_grobs <- matrix(labels, ncol=nc)
# Place labels in a gtable
g <- gtable_matrix("table", grobs=label_grobs,
widths=col_widths(label_grobs) + unit(8,"mm"),
heights=row_heights(label_grobs) + unit(5,"mm"))
# Add the background
g <- gtable_add_grob(g, fill, t=rep(seq_len(nr), each=nc),
l=rep(seq_len(nc), nr), z=0, name="fill")
# Graphical representation
grid.draw(g)
}
#------------------------------------------------------------------------------#
################################################################################
# FUNCTION: logit.cor
################################################################################
#' Computes an association value between a dichotomous variable and a
#' continuous one.
#'
#'
#'
#' @param FIT a \code{glm} object referred to the logistic regression of a
#' dichotomous variable.
#' @param y the response variable (dichotomous).
#' @param logit.acc when \code{y} is dichotomous, the measure to compute for
#' the correlation between \code{y} and the proposed \emph{balance}
#' adjusting for covariates. One of the following values: \code{"Rsq"} (default),
#' \code{"AUC"} or \code{"Tjur"}.
#'
#'
#' @return The association value using the selected method \code{logit.acc}.
#'
#'
#' @export logit.cor
# Define the function logit.cor
logit.cor <- function(FIT, y, covar = NULL,logit.acc){
if (logit.acc == "AUC"){
#d <- as.numeric(auc(y, FIT$fitted.values, quiet=TRUE)) #
if (class(y) == "factor") {numy<-as.numeric(y)-1} else {numy<-y} #new diferences
d <- as.numeric(auc(numy, FIT$fitted.values, quiet=TRUE)) # new diferences
} else if (logit.acc == "Rsq"){
d <- cor(as.numeric(y), FIT$fitted.values)^2
} else if (logit.acc == "Tjur"){
if (class(y) == "factor") {numy<-as.numeric(y)-1} else {numy<-y}
d <- mean(FIT$fitted.values[numy==1]) - mean(FIT$fitted.values[numy==0])
} else if (logit.acc == "Dev"){
f.class <- ifelse (class(y) == "factor", "binomial", "gaussian")
if (class(y) == "factor") {numy<-as.numeric(y)-1} else {numy<-y}
#if (is.null(covar)){
d<-1-(deviance(FIT)/deviance(glm(numy~1,family=f.class))) # proportion of explained deviance
# } else {
# d<-1-(deviance(FIT)/deviance(glm(numy~., data=cbind(numy,covar),family=f.class))) # proportion of explained deviance
# }
# d<-1-(deviance(FIT)/nulldev0) # proportion of explained deviance
}
# Return the value
return(d)
}
#------------------------------------------------------------------------------#
################################################################################
# FUNCTION: rowM
################################################################################
#' Calculates the mean of each row of a matrix (though having only one column).
#'
#'
#'
#' @param x a \code{matrix} object.
#'
#' @return A \code{vector} with the mean of each row of \code{x}, even if the
#' matrix only contains one column.
#'
#' @examples
#' # Build a matrix with one column
#' M <- matrix(rnorm(10), nrow=1)
#' # rowM (resulting on the same matrix M)
#' rowM(M)
#'
#' # Build a matrix
#' M <- matrix (runif(100), nrow=10)
#' # Apply rowM function
#' rowM(M)
#'
#' @export rowM
# Define the function rowM
rowM <- function(x){
if(is.vector(x)) {u <- x
} else { u <- rowMeans(x)}
return(u)
}
#------------------------------------------------------------------------------#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.