#' =======================================================================
#' Feature frequency and stability of feature ranking.
#' History:
#' 01-02-2007: commence
#' 04-02-2007: feature ranking frequency and overlap rate
#' 10-02-2007: add the stability
#' 09-07-2007: output all frequencies
#' Arguments:
#' x - a matrix or data frame of feature order
#' rank.cutoff - top feature order cut-off
#' freq.cutoff - feature frequency cut-off
#' References:
#' 1.) Chad A Davis, et al., (2006), Reliable gene signatures for microarray
#' classification: assessment of stability and performance. Bioinformatics,
#' vol.22, no.19, pages 2356 - 2363.
#' 2.) Michiels, S., Koscielny, S. and Hill, C. (2005), Prediction of cancer
#' outcome with microarrays: a multiple random validation strategy. Lancet,
#' vol.365, pages 488 - 492.
feat.freq <- function(x, rank.cutoff = 50, freq.cutoff = 0.5) {
x <- as.matrix(x)
x <- x[1:rank.cutoff, , drop = F]
tmp <- table(x)
tmp <- sort(tmp, decreasing = T) / ncol(x)
fre <- as.vector(tmp) #' strip off a character 'x'
names(fre) <- names(tmp)
#' overlap.rate <- sum(fre==1)/rank.cutoff
res <- list(
freq.all = fre,
freq = fre[fre >= freq.cutoff],
rank.cutoff = rank.cutoff,
freq.cutoff = freq.cutoff,
stability = sum(fre) / length(fre)
#' overlap.fs = if (overlap.rate) names(fre[fre==1]) else NULL,
)
return(res)
}
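#'
#' Usage sketch (a hedged example; the feature-order matrix 'ord' below is
#' simulated purely for illustration):
#' set.seed(123)
#' ord <- sapply(1:10, function(i) sample(paste0("f", 1:100)))
#' res <- feat.freq(ord, rank.cutoff = 20, freq.cutoff = 0.5)
#' res$stability #' mean selection frequency of the top-ranked features
#' res$freq #' features appearing in at least 50% of the 10 rankings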
#' ========================================================================
#' lwc-16-02-2007: Calculate the consensus of feature selection by different
#' methods
#' Internal function.
#' Arguments:
#' freq - a list of named vectors of feature frequencies above a threshold
feat.cons <- function(freq, disp = TRUE) {
#' lwc-18-02-2007: If only the names are needed, the following code line
#' is enough.
#' fs.names <- unique(unlist(lapply(freq, names)))
fs.names <- lapply(freq, function(x) names(x))
fs.names <- do.call("c", fs.names)
fs.names <- sort(table(fs.names), decreasing = T)
mat <- matrix(
data = NA, nrow = length(fs.names), ncol = length(freq),
dimnames = list(names(fs.names), names(freq))
)
fs.tab <- sapply(names(freq), function(x) {
dname <- names(freq[[x]])
if (disp) mat[dname, x] <- freq[[x]] else mat[dname, x] <- 1
#' mat[dname,x] <- ifelse(disp, freq[[x]],1) #' does not work
return(mat[, x])
})
#' print(fs.tab, na.print="")
return(fs.tab)
}
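#'
#' Usage sketch (hedged; reuses feat.freq on two simulated rankings, with
#' arbitrary method names 'm1' and 'm2'):
#' set.seed(1)
#' ord1 <- sapply(1:10, function(i) sample(paste0("f", 1:50)))
#' ord2 <- sapply(1:10, function(i) sample(paste0("f", 1:50)))
#' f1 <- feat.freq(ord1, rank.cutoff = 10, freq.cutoff = 0.2)$freq
#' f2 <- feat.freq(ord2, rank.cutoff = 10, freq.cutoff = 0.2)$freq
#' feat.cons(list(m1 = f1, m2 = f2)) #' NA-padded frequency table per method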
#' ========================================================================
#' wll-04-07-2007: Multiple feature selectors with resampling procedures.
#' Gives a rank table based on the statistics and a feature
#' consensus table.
#' wll-12-12-2007: add output of fs.freq and R doc
#' wll-29-04-2008: Wrapper function for multiple feature selection with or
#' without re-sampling.
#' wll-29-05-2008: Add no-resampling code;
#' wll-21-10-2009: Integrate re-sampling and no-resampling
#' lwc-15-02-2010: drop fs.tab.
#' lwc-25-02-2010: minor changes
#' Arguments:
#' x - data matrix or data frame
#' y - class labels (factor)
#' method - a set of feature selections methods
#' pars - validation control parameters.
#' is.resam - Boolean indicator of re-sampling.
feat.mfs <- function(x, y, method, pars = valipars(), is.resam = TRUE,
...) {
res <- lapply(method, function(m) {
cat("\nFeature Selector:", m, "\n")
flush.console()
if (is.resam) {
feat.rank.re(x, y, method = m, pars = pars, ...)
} else {
#' model <- if (is.function(m)) m
#' else if (is.character(m)) get(m)
#' else eval(m)
model <- get(m)
model(x, y, ...)
}
})
names(res) <- method
fs.rank <- sapply(res, function(x) x$fs.rank)
fs.order <- sapply(res, function(x) x$fs.order)
fs.order <- as.data.frame(fs.order, stringsAsFactors = F)
fs.stats <- if (is.resam) {
sapply(res, function(x) x$fs.stats)
} else {
sapply(res, function(x) x$stats)
}
feat.res <- list(
fs.order = fs.order, fs.rank = fs.rank, fs.stats = fs.stats,
all = res
)
return(feat.res)
}
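#'
#' Usage sketch (hedged; assumes the package helpers valipars() and
#' trainind() are available, and uses a two-class subset of iris):
#' data(iris)
#' x <- iris[iris$Species != "setosa", 1:4]
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' pars <- valipars(sampling = "boot", niter = 2, nreps = 5)
#' res <- feat.mfs(x, y, method = c("fs.anova", "fs.auc"), pars = pars)
#' res$fs.stats #' averaged statistics for each feature and selector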
#' ======================================================================
#' lwc-25-02-2010: Calculate frequency and consensus from the results of
#' 'feat.mfs' with 'is.resam = TRUE'.
#' wll-05-12-2015: Should add error checking here.
feat.mfs.stab <- function(fs.res, rank.cutoff = 20, freq.cutoff = 0.5) {
order.list <- lapply(fs.res$all, function(x) x$order.list)
freq.all <- lapply(order.list, function(x) {
feat.freq(x, rank.cutoff = rank.cutoff, freq.cutoff = freq.cutoff)
})
fs.freq <- lapply(freq.all, function(x) x$freq)
fs.subs <- lapply(freq.all, function(x) names(x$freq))
fs.stab <- lapply(freq.all, function(x) x$stability)
fs.cons <- feat.cons(fs.freq)
#' print(fs.cons,digits=2,na.print="")
fs <- list(
fs.freq = fs.freq, fs.subs = fs.subs,
fs.stab = fs.stab, fs.cons = fs.cons
)
return(fs)
}
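#'
#' Usage sketch (hedged; continues the feat.mfs example above, which was
#' run with is.resam = TRUE):
#' stab <- feat.mfs.stab(res, rank.cutoff = 3, freq.cutoff = 0.5)
#' stab$fs.stab #' stability of each selector
#' stab$fs.cons #' consensus table across selectors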
#' =======================================================================
#' lwc-03-09-2010: Plot the stats values of multiple feature selection.
#' lwc-06-09-2010: add fs.tab
#' wll-05-12-2015: should use this function in data analysis
#' Note:
#' Arguments:
#' fs.stats - Stats value of features
#' cumu.plot - A logical value indicating whether the cumulative scores
#' should be plotted.
feat.mfs.stats <- function(fs.stats, cumu.plot = FALSE, main = "Stats Plot",
ylab = "Values", xlab = "Index of variable", ...) {
fs.stats <- as.matrix(fs.stats)
nam <- colnames(fs.stats)
fs.stats <- lapply(nam, function(x) { #' x = nam[1]
val <- fs.stats[, x] #' if stats is a data frame, val has no names.
val <- sort(val, decreasing = T, na.last = T)
})
names(fs.stats) <- nam
#' Get features table based on stats
fs.tab <- lapply(fs.stats, function(x) {
list(fs = names(x), val = x)
})
#' fs.tab <- list2df(un.list(fs.tab))
#' Note-09-03-2010: If you use cumulative scores, you can easily calculate
#' the number of features needed to reach a fixed proportion, such as 80%.
if (cumu.plot) {
st <- lapply(fs.stats, function(x) cumsum(x / sum(x, na.rm = T)))
} else {
st <- fs.stats
}
#' reshape data for plotting
st <- do.call(cbind, st)
st <- cbind(st, idx = 1:nrow(st))
st <- data.frame(st, stringsAsFactors = F)
#' wl-06-11-2021, Sat: use base R function to get long format
st_long <- with(st, data.frame(idx = st$idx, stack(st, select = -idx)))
# st_long <- data.frame(idx = st$idx,stack(st,select = -idx))
st_long <- st_long[c("idx", "ind", "values")]
names(st_long) <- c("idx", "variable", "value")
#' st_l <- reshape::melt(st, id = "idx")
st.p <- xyplot(value ~ idx | variable,
data = st_long,
as.table = T, type = c("g", "l"),
scales = list(cex = .75, relation = "free"),
par.strip.text = list(cex = 0.65),
ylab = ylab, xlab = xlab, main = main, ...
)
print(st.p) #' lattice objects must be printed explicitly inside a function
res <- list(stats.tab = fs.tab, stats.long = st_long, stats.p = st.p)
return(res)
}
#' =======================================================================
#' wll-06-11-2008: Use Borda count to get the final feature order
#' Note: Previous name is fs.agg
feat.agg <- function(fs.rank.list) {
fs.score <- apply(fs.rank.list, 1, sum)
fs.order <- order(fs.score, decreasing = F) #' order from best to worst.
fs.rank <- order(fs.order, decreasing = F) #' feature rank score.
names(fs.rank) <- rownames(fs.rank.list)
temp <- names(fs.rank[fs.order])
if (!is.null(temp)) fs.order <- temp
return(list(fs.order = fs.order, fs.rank = fs.rank))
}
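#'
#' Usage sketch (hedged; a hand-made rank matrix with features in rows and
#' selectors in columns):
#' rl <- cbind(m1 = c(a = 1, b = 2, c = 3), m2 = c(a = 2, b = 1, c = 3))
#' feat.agg(rl) #' Borda count: smallest rank sums give the best features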
#' ======================================================================
#' wll-20-03-2007: resampling-based feature ranking/selection
#' wll-23-07-2008: some change in loops handling
feat.rank.re <- function(x, y, method, pars = valipars(), tr.idx = NULL,
...) {
#' validity checking
if (missing(x) || missing(y)) {
stop("data set or class are missing")
}
if (length(dim(x)) != 2) {
stop("'x' must be a matrix or data frame")
}
y <- as.factor(y) #' some classifiers need a factor, such as SVM.
if (nrow(x) != length(y)) stop("x and y don't match.")
if (length(unique(y)) < 2) {
stop("Classification needs at least two classes.")
}
if (any(is.na(x)) || any(is.na(y))) {
stop("NA is not permitted in data set or class labels.")
}
n <- nrow(x) #' number of samples
p <- ncol(x) #' size of feature
#' construct index of train data
if (is.null(tr.idx)) {
if (pars$sampling == "cv" && pars$nreps > n) {
pars$sampling <- "loocv"
pars$niter <- 1
}
if (pars$sampling == "cv" && pars$nreps < 2) {
stop("Number of folds (nreps) for cv must be greater than 1")
}
tr.idx <- trainind(y, pars = pars)
} else {
pars$sampling <- c("user")
}
pars$niter <- length(tr.idx)
pars$nreps <- length(tr.idx[[1]])
#' feature selection with re-sampling
cat("Iter (", pars$niter, "):", sep = "")
res.all <- lapply(1:pars$niter, function(i) {
cat(" ", i, sep = "")
flush.console()
train.ind <- tr.idx[[i]]
res <- lapply(1:pars$nreps, function(j) {
x.tr <- x[train.ind[[j]], , drop = F]
y.tr <- y[train.ind[[j]]]
do.call(method, c(list(x = x.tr, y = y.tr), list(...)))
})
names(res) <- paste("Reps", 1:pars$nreps, sep = "_")
res
})
cat("\n")
names(res.all) <- paste("Iter", 1:pars$niter, sep = "_")
rank.list <-
lapply(res.all, function(x) as.data.frame(sapply(x, function(y) y$fs.rank)))
order.list <-
lapply(res.all, function(x) as.data.frame(sapply(x, function(y) y$fs.order)))
stats.list <-
lapply(res.all, function(x) as.data.frame(sapply(x, function(y) y$stats)))
rank.list <- do.call("cbind", rank.list)
order.list <- do.call("cbind", order.list)
stats.list <- do.call("cbind", stats.list)
fs.stats <- apply(stats.list, 1, mean)
#' Use Borda count to get the final feature order
fs.score <- apply(rank.list, 1, sum)
fs.order <- order(fs.score, decreasing = F) #' feature order from best to worst.
fs.rank <- order(fs.order, decreasing = F) #' feature rank score.
names(fs.rank) <- rownames(rank.list)
temp <- names(fs.rank[fs.order])
if (!is.null(temp)) {
fs.order <- noquote(temp)
} #' lwc-16-02-2010: Should we remove noquote?
res <- list(
method = method,
fs.order = fs.order, #' feature order
fs.rank = fs.rank, #' feature rank
fs.stats = fs.stats, #' means of stats
rank.list = rank.list, #' full feature rank list
order.list = order.list, #' full feature order list
pars = pars, #' resampling parameters
tr.idx = tr.idx, #' index of training samples.
all = res.all
) #' all results of re-sampling
return(res)
}
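#'
#' Usage sketch (hedged; assumes valipars() and trainind() from this
#' package, and uses a two-class subset of iris):
#' data(iris)
#' x <- iris[iris$Species != "setosa", 1:4]
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' pars <- valipars(sampling = "cv", niter = 2, nreps = 5)
#' res <- feat.rank.re(x, y, method = "fs.auc", pars = pars)
#' res$fs.order #' Borda-aggregated feature order across all resamples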
#' =======================================================================
#' wll-31-10-2007: feature selection using VIP of PLS.
#' NOTE: This function supports a multiple-response Y (i.e. a dummy matrix
#' for discrimination). The Mahalanobis distance of the VIP values is
#' computed as the criterion for feature/variable selection.
#' References:
#' 1. Svante Wold et. al., PLS-regression: a basic tool of chemometrics.
#' Chemometrics and Intelligent Laboratory Systems 58(2001), 109-130.
#' 2. Sarah J. Dixon, et.al., Pattern recognition of gas chromatography mass
#' spectrometry of human volatiles in sweat to distinguish the sex of
#' subjects and determine potential discriminatory marker peaks.
#' Chemometrics and Intelligent Laboratory Systems 87(2007), 161-172.
#' 3. Chong, Il-Gyo and Jun, Chi-Hyuck, Performance of some variable
#' selection methods when multicollinearity is present, Chemometrics and
#' Intelligent Laboratory Systems 78(2005), 103-112.
fs.plsvip <- function(x, y, ncomp = 10, ...) {
if (!is.data.frame(x)) x <- as.data.frame(x)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
val <- plsc(x, y, pls = "oscorespls", ncomp = ncomp)
#' NOTE: Only NIPALS supports VIP values.
pls <- val$pls.out
#' calculate SS
y.lo <- (unclass(pls$Yloadings))^2
x.sc <- colSums(pls$scores^2)
x.sc <- matrix(x.sc, nrow = nrow(y.lo), ncol = ncol(y.lo), byrow = T)
SS <- y.lo * x.sc #' not matrix product %*%.
#' calculate normalised squared weight
W <- (unclass(pls$loading.weights))^2
sumW <- colSums(W)
sumW <- matrix(sumW, nrow = nrow(W), ncol = ncol(W), byrow = T)
W <- W / sumW
SSW <- W %*% t(SS)
sumSS <- apply(SS, 1, sum)
sumSS <- matrix(sumSS, nrow = nrow(SSW), ncol = ncol(SSW), byrow = T)
vip <- sqrt(nrow(SSW) * (SSW / sumSS))
#' vip <- rowSums(abs(vip))
#' Mahalanobis distances
val <- sqrt(mahalanobis(vip, colMeans(vip), cov(vip), inverted = T))
#' feature rank and feature order
fs.order <- order(val, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(val)
nam <- names(val[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = val)
return(res)
}
#' =======================================================================
#' wll-31-10-2007: feature selection using VIP of PLS.
#' NOTE: This function supports a multiple-response Y (i.e. a dummy matrix
#' for discrimination). The final VIP is the mean of the absolute VIP values.
fs.plsvip.1 <- function(x, y, ncomp = 10, ...) {
if (!is.data.frame(x)) x <- as.data.frame(x)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
val <- plsc(x, y, pls = "oscorespls", ncomp = ncomp)
#' NOTE: Only NIPALS supports VIP values.
pls <- val$pls.out
#' calculate SS
y.lo <- (unclass(pls$Yloadings))^2
x.sc <- colSums(pls$scores^2)
x.sc <- matrix(x.sc, nrow = nrow(y.lo), ncol = ncol(y.lo), byrow = T)
SS <- y.lo * x.sc #' not matrix product %*%.
#' calculate normalised squared weight
W <- (unclass(pls$loading.weights))^2
sumW <- colSums(W)
sumW <- matrix(sumW, nrow = nrow(W), ncol = ncol(W), byrow = T)
W <- W / sumW
SSW <- W %*% t(SS)
sumSS <- apply(SS, 1, sum)
sumSS <- matrix(sumSS, nrow = nrow(SSW), ncol = ncol(SSW), byrow = T)
vip <- sqrt(nrow(SSW) * (SSW / sumSS))
val <- rowMeans(abs(vip))
#' Mahalanobis distances
#' val <- sqrt(mahalanobis(vip, colMeans(vip), cov(vip), inverted=T))
#' feature rank and feature order
fs.order <- order(val, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(val)
nam <- names(val[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = val)
return(res)
}
#' =======================================================================
#' wll-29-10-2007: feature selection using VIP of PLS.
#' NOTE: To calculate VIP, two conditions need to be satisfied:
#' 1.) the PLS algorithm is NIPALS;
#' 2.) Y must be a single vector, not a multi-column matrix. Hence, for
#' classification, coding Y as a single vector is not efficient,
#' especially for the multi-class problem. For a two-class problem,
#' coding y as 0 and 1, or as 1 and -1, may be OK for a single y
#' vector.
fs.plsvip.2 <- function(x, y, ncomp = 10, ...) {
if (!is.data.frame(x)) x <- as.data.frame(x)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
#' convert to numeric, especially for factor.
y <- as.numeric(y)
#' NOTE: need to consider a nice way to convert
pls <- oscorespls.fit(as.matrix(x), y, ncomp = ncomp)
#' NOTE: Only NIPALS supports VIP values.
#' VIP values (taken from http://mevik.net/work/software/pls.html)
SS <- c(pls$Yloadings)^2 * colSums(pls$scores^2)
Wnorm2 <- colSums(pls$loading.weights^2)
SSW <- sweep(pls$loading.weights^2, 2, SS / Wnorm2, "*")
val <- sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS))
val <- val[ncomp, ] #' extract VIP values for ncomp components
#' feature rank and feature order
fs.order <- order(val, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(val)
nam <- names(val[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = val)
return(res)
}
#' =========================================================================
#' wll-29-10-2007: feature selection using regression coefficients of PLS.
#' wll-19-11-2015: add 'pls:::' in front of 'coef.mvr' since it is 'hidden'
#' in the new version of 'pls'
#' NOTE: I tried to use robust estimation of the centre and covariance
#' matrix via cov.rob in package MASS, but the collinearity problem
#' appeared. Therefore, the simple Mahalanobis distance is used.
#' NOTES: 1.) Mahalanobis distance and leverage are often used to detect
#' outliers especially in the development of linear regression models.
#' A point that has a greater Mahalanobis distance from the rest of
#' the sample population of points is said to have higher leverage
#' since it has a greater influence on the slope or coefficients of
#' the regression equation. (From
#' http://en.wikipedia.org/wiki/Mahalanobis_distance)
fs.pls <- function(x, y, pls = "simpls", ncomp = 10, ...) {
if (!is.data.frame(x)) x <- as.data.frame(x)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
val <- plsc(x, y, pls = pls, ncomp = ncomp, ...)
#' wl-08-11-2021, Mon: Use this one
coe <- drop(coef(val$pls.out, ncomp = val$ncomp))
# coe <- drop(pls:::coef.mvr(val$pls.out, ncomp = val$ncomp))
#' lwc-14-06-2010: After running plsc, ncomp may change; hence ncomp here
#' use val$ncomp
#' Mahalanobis distances
val <- sqrt(mahalanobis(coe, colMeans(coe), cov(coe), inverted = T))
#' val <- sapply(as.data.frame(t(coe)), function(x) sqrt(sum(x^2)))
#' val <- sapply(as.data.frame(t(coe)), function(x) sum(abs(x)))
#' feature rank and feature order
fs.order <- order(val, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(val)
nam <- names(val[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = val)
return(res)
}
#' ======================================================================
#' wll-30-10-2007: feature selection using loadings of PCA.
#' lwc-22-09-2011: a bug fixed.
#' NOTE: 1. To check the eigenvalues, use screeplot().
#' 2. If it is combined with other supervised methods such as fs.rf and
#' fs.anova as the input methods for feat.mfs and feat.mfs.1,
#' 'thres' should be given explicitly to avoid conflict with 'y',
#' e.g., fs.m <- c("fs.anova", "fs.rf", "fs.pca"); feat.mfs.1(dat, cls,
#' method = fs.m, is.resam = F, thres = 0.8)
fs.pca <- function(x, thres = 0.8, ...) {
x <- as.matrix(x)
obj <- prcomp(x, ...)
vars <- obj$sdev^2
vars <- vars / sum(vars) #' Proportion of Variance
cumvars <- cumsum(vars) #' Cumulative Proportion
names(cumvars) <- colnames(obj$rotation)
id <- which(cumvars >= thres)[1]
if (id == 1) id <- 2 #' lwc-22-09-2011:
lo <- obj$rotation[, 1:id] #' selected loadings
#' Mahalanobis distances
#' rob <- cov.rob(lo, method="mve") #' c("mve", "mcd", "classical")
#' val <- sqrt(mahalanobis(lo, rob$center, rob$cov,tol = 1e-7))
val <- sqrt(mahalanobis(lo, colMeans(lo), cov(lo), inverted = T))
#' val <- sapply(as.data.frame(t(lo)), function(x) sqrt(sum(x^2)))
#' val <- sapply(as.data.frame(t(lo)), function(x) sum(abs(x)))
#' feature rank and feature order
fs.order <- order(val, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(val)
nam <- names(val[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = val)
return(res)
}
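#'
#' Usage sketch (hedged; extra arguments such as 'scale.' are passed on to
#' prcomp):
#' data(iris)
#' fs.pca(iris[, 1:4], thres = 0.9, scale. = TRUE)$fs.order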
#' =======================================================================
#' lwc-12-04-2007: feature selection using ratio of between-group
#' to within-group sums of squares (BW).
#' References: Dudoit S, Fridlyand J, Speed T.P. Comparison of
#' discrimination methods for the classification of tumors using gene
#' expression data. J Amer Statist Assoc 2002, 97:7.
#' NOTE: It has been noted that the BW ratio for multi-class classification
#' is a modification of the F-ratio statistic of one-way ANOVA.
fs.bw <- function(x, y, ...) {
if (!is.data.frame(x)) x <- as.data.frame(x)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
bw <- sapply(x, function(z) {
#' z <- x[,1] #' for debug
mn.all <- mean(z)
mn.grp <- tapply(z, y, mean)
tmp.1 <- tmp.2 <- 0
for (i in 1:length(z)) {
cls <- y[i] #' which class
tmp.1 <- tmp.1 + (mn.grp[[cls]] - mn.all)^2
tmp.2 <- tmp.2 + (z[i] - mn.grp[[cls]])^2
}
tmp.1 / tmp.2
})
fs.order <- order(bw, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(bw)
nam <- names(bw[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = bw)
return(res)
}
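#'
#' Usage sketch (hedged; works for two or more classes):
#' data(iris)
#' fs.bw(iris[, 1:4], iris$Species)$stats #' BW ratio per feature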
#' =========================================================================
#' lwc-06-04-07: Feature selection using RELIEF
#' wll-06-04-07: According to the original algorithm, a random sample should
#' be drawn in each computation, but then the result of each run differs.
#' So the idea is changed here: every sample is used to update the weights.
#' wll-15-10-07: 1. Extended to ReliefF, whose main idea is to take the
#' k (k >= 1, default 10) nearest hits/misses and average over
#' all of them. 2. Added a user-defined number of instances to
#' sample; by default, all instances are used.
#' References:
#' 1.) KIRA, K. and RENDEL, L. (1992). The Feature Selection Problem :
#' Traditional Methods and a new algorithm. Proc. Tenth National Conference
#' on Artificial Intelligence, MIT Press, 129-134.
#' 2.) KONONENKO, I., SIMEC, E., and ROBNIK-SIKONJA, M. (1997). Overcoming
#' the myopia of induction learning algorithms with RELIEFF. Applied
#' Intelligence Vol7, 1, 39-55.
#' 3.) Igor Kononenko, Estimating Attributes: Analysis and Extensions of
#' RELIEF, European Conference on Machine Learning, Ed. Francesco Bergadano
#' and Luc De Raedt, 1994, 171-182, Springer
#' 4.) MARKO ROBNIK-SIKONJA and IGOR KONONENKO, Theoretical and Empirical
#' Analysis of ReliefF and RReliefF, Machine Learning, 53, 23 - 69, 2003
fs.relief <- function(x, y, m = NULL, k = 10, ...) {
#' Find the nearest neighbors from a matrix
nearest <- function(x, mat, k = 10) {
#' Euclidean distance
dis <- sapply(as.data.frame(t(mat)), function(y) sqrt(sum((x - y)^2)))
k <- min(k, length(dis)) #' wll-21-03-2008: fix a bug spotted by Ian Scott.
#' ind <- which.min(dis)
ind <- sort.list(dis)[1:k]
return(mat[ind, , drop = F])
}
if (!is.matrix(x)) x <- as.matrix(x)
if (!is.factor(y)) y <- as.factor(y)
if (length(y) != nrow(x)) {
stop("x and y are not consistent.")
}
n <- nrow(x)
p <- ncol(x)
gp <- levels(y)
prio <- table(y) / n #' Computing the prior
#' Calculating the range of each feature. range = Max - Min
rng <- sapply(as.data.frame(x), function(x) diff(range(x)))
if (is.null(m)) {
m <- n
} else {
m <- min(m, n)
}
idx <- sample(1:n, m, replace = F)
weight <- rep(0, p)
for (i in idx) {
#' split x by group
dat <- split.data.frame(x[-i, , drop = F], y[-i])
#' find nearest neighbours
near <- lapply(dat, function(z) nearest(x[i, ], z, k = k))
hit <- gp[gp == y[i]]
miss <- gp[gp != y[i]]
delta <- rep(0, p)
for (j in 1:p) {
diff.hit <- -mean(abs(x[i, ][j] - near[[hit]][, j, drop = T]))
diff.miss <- lapply(miss, function(z) {
prio[z] * mean(abs(x[i, ][j] - near[[z]][, j, drop = T]))
})
diff.miss <- do.call("sum", diff.miss)
diff.miss <- diff.miss / (1 - prio[hit])
delta[j] <- (1 / m) * ((diff.hit + diff.miss) / rng[j])
}
#' update weights
weight <- weight + delta
}
names(weight) <- colnames(x)
fs.order <- order(weight, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(weight)
nam <- names(weight[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = weight)
return(res)
}
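#'
#' Usage sketch (hedged; 'm' and 'k' below are arbitrary choices, and the
#' result depends on the random sampling of instances):
#' data(iris)
#' set.seed(100)
#' fs.relief(iris[, 1:4], iris$Species, m = 30, k = 5)$fs.order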
#' ========================================================================
#' RFE-SVM feature selection
#' History:
#' lwc-02-11-2006:
#' lwc-15-11-2006: fix fs.len as "power2"
#' lwc-12-01-2007: re-write
#' lwc-17-01-2007: sequence of number of features in decreasing order
#' lwc-18-01-2008: the dots argument has a problem, but I think it is a
#' problem of svm, not of dots itself, so I strip off ... here.
#' Does svm treat ... and list(...) differently?
fs.rfe <- function(x, y, fs.len = "power2", ...) {
#' lwc-16-01-2007: avoid multiple actual arguments in calling svm.
#' dots <- list(...)
#' if(hasArg(kernel)) dots$kernel <- NULL
#' LWC-24-10-2006: Calculates the primal variables w which are stored in
#' warray
wts.svm <- function(x) {
#' warray[k,l,] is the weight vector for the binary problem of class k
#' against class l
ncl <- length(x$labels)
classk <- rep(1:ncl, x$nSV)
p <- dim(x$SV)[2]
#' array of the weight vectors
warray <- array(0, dim = c(ncl, ncl, p))
#' loop to use the coefs
for (s in 1:dim(x$SV)[1]) {
for (co in 1:(ncl - 1)) {
#' find the two class problem
k <- classk[s]
l <- ((1:ncl)[-k])[co]
warray[k, l, ] <- warray[k, l, ] + x$coefs[s, co] * x$SV[s, ]
warray[l, k, ] <- warray[l, k, ] + x$coefs[s, co] * x$SV[s, ]
}
}
#' return twice the sum of the absolute value of primal variables w
wts <- apply(abs(warray), 3, sum)
return(wts)
}
y <- as.factor(y) #' 31-03-2007: must be factor if for classification.
p <- ncol(x)
fs.order <- seq(1, p)
#' get feature lengths for SVM-RFE computation.
len <- get.fs.len(p, fs.len = fs.len)
len <- sort(len, decreasing = T) #' must be decreasing order for SVM-RFE
nlen <- length(len)
for (i in 1:nlen) {
#' extract index of feature for this length.
sel <- fs.order[1:len[i]]
#' call SVM with linear kernel.
model <- svm(x[, sel, drop = F], y, kernel = "linear")
#' model <- svm(x[,sel,drop=F], y, kernel = "linear",dots)
#' calculate the weights
wts <- wts.svm(model)
#' sort the feature based on the weights
ord <- order(wts, decreasing = TRUE)
#' update the feature set
fs.order[1:len[i]] <- sel[ord]
}
fs.rank <- order(fs.order)
names(fs.rank) <- colnames(x)
nam <- colnames(x)[fs.order]
if (!is.null(nam)) fs.order <- nam
#' wll-05-07-2007: add stats for consistency with other methods.
#' No other purpose.
stats <- length(fs.rank) - fs.rank
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = stats)
return(res)
}
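#'
#' Usage sketch (hedged; assumes svm() from package e1071 is available):
#' library(e1071)
#' data(iris)
#' x <- as.matrix(iris[iris$Species != "setosa", 1:4])
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' fs.rfe(x, y, fs.len = "half")$fs.order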
#' ========================================================================
#' SNR for feature selection
#' wll-20-03-2007: SNR is only for two-class classification.
#' wll-29-10-2008: it is similar to the Fisher criterion: (m1-m2)^2/(v1+v2)
fs.snr <- function(x, y, ...) {
y <- as.factor(y)
if (length(levels(y)) != 2) {
stop("'y' must have two classes")
}
g.mn <- sapply(data.frame(x), function(x) tapply(x, y, mean))
g.sd <- sapply(data.frame(x), function(x) tapply(x, y, sd))
snr <- abs(g.mn[1, ] - g.mn[2, ]) / (g.sd[1, ] + g.sd[2, ])
fs.order <- order(abs(snr), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(snr)
nam <- names(snr[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = abs(snr))
return(res)
}
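#'
#' Usage sketch (hedged; requires exactly two classes):
#' data(iris)
#' x <- iris[iris$Species != "setosa", 1:4]
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' fs.snr(x, y)$stats #' absolute SNR per feature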
#' ========================================================================
#' AUC for feature selection
#' wll-20-03-2007: AUC is only for two-class classification.
fs.auc <- function(x, y, ...) {
y <- as.factor(y)
if (length(levels(y)) != 2) {
stop("'y' must have two classes")
}
levels(y) <- c(0, 1) #' recode levels as 0 and 1
y <- as.numeric(levels(y))[as.integer(y)]
auc <- sapply(as.data.frame(x), function(x) {
y <- y[order(x, decreasing = TRUE)]
tmp <- cumsum(y) / sum(y)
mean(tmp[y == 0])
})
#' library(limma)
#' auc <- sapply(as.data.frame(x),function(z) auROC(y,z))
#' library(verification)
#' auc <- sapply(as.data.frame(x),function(z) roc.area(y,z)$A)
#' library(ROCR)
#' auc <- sapply(as.data.frame(x),function(z)
#' as.numeric(performance(prediction(z,y),measure="auc")@y.values))
auc[auc < 0.5] <- 1 - auc[auc < 0.5]
fs.order <- order(abs(auc), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(auc)
nam <- names(auc[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = abs(auc))
return(res)
}
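#'
#' Usage sketch (hedged; requires exactly two classes; AUC values below 0.5
#' are reflected to 1 - AUC):
#' data(iris)
#' x <- iris[iris$Species != "setosa", 1:4]
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' fs.auc(x, y)$stats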
#' ========================================================================
#' randomForest for feature selection
#' lwc-20-03-2007: Commence
#' wll-08-12-2015: Use scaled importance scores.
#' Note-08-12-2015:
#' 1.) If there are a large number of zeros in the importance scores, which
#' in turn leads to ties in the rank list, the results of resampling in
#' which rank aggregation is used are not reliable.
#' 2.) Random Forest is a stochastic method, which leads to different
#' results for different runs even if the random seed has been set by
#' set.seed().
#' 3.) The application of 'fs.rf' should be limited.
fs.rf <- function(x, y, ...) {
tmp <- randomForest(x, y, importance = T, ...)
meas <- tmp$importance[, ncol(tmp$importance) - 1]
meas[meas <= 0] <- 0
#' Or use the following two lines
if (F) {
meas <- importance(tmp, type = 1, scale = TRUE)
meas <- meas[, 1]
}
fs.order <- order(meas, decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(meas)
nam <- names(meas[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = meas)
return(res)
}
#' ========================================================================
#' wll-29-10-2008: Another version of RF for feature selection based on
#' successively eliminating the least important variables.
fs.rf.1 <- function(x, y, fs.len = "power2", ...) {
y <- as.factor(y)
p <- ncol(x)
fs.order <- seq(1, p) #' initialisation
#' get feature lengths
len <- get.fs.len(p, fs.len = fs.len)
len <- sort(len, decreasing = T) #' must be decreasing order
nlen <- length(len)
for (i in 1:nlen) {
#' extract index of feature for this length.
sel <- fs.order[1:len[i]]
#' call randomForest
rf <- randomForest(x[, sel, drop = F], y,
importance = T,
keep.forest = FALSE, ...
)
imp <- importance(rf, type = 1, scale = TRUE)
imp <- imp[, 1]
#' sort the features based on the scaled importance scores
ord <- order(imp, decreasing = T, na.last = T)
#' update the feature set
fs.order[1:len[i]] <- sel[ord]
}
fs.rank <- order(fs.order)
names(fs.rank) <- colnames(x)
nam <- colnames(x)[fs.order]
if (!is.null(nam)) fs.order <- nam
#' Add stats for consistency with other methods. No other purpose.
stats <- length(fs.rank) - fs.rank
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = stats)
return(res)
}
#' =========================================================================
#' Welch test for feature selection
#' lwc-05-01-2007: commence
#' lwc-16-03-2007: minor change. Note: do not use sort with na.last = T
#' lwc-19-05-2008: Change oneway.test to t.test with dot arguments, so this
#' method supports the paired t-test (Welch). Also strip off
#' noquote.
#' wll-17-06-2008: change data.frame to as.data.frame in case the data
#' frame names are changed (e.g., when the names of a data
#' frame are numeric-like characters)
fs.welch <- function(x, y, ...) {
tmp <- sapply(as.data.frame(x), function(x) {
tmp <- t.test(x ~ y, var.equal = F, ...)
#' tmp <- oneway.test(x ~ y,var.equal=F)
c(tmp$statistic, tmp$p.value)
})
stats <- tmp[1, ]
pval <- tmp[2, ]
fs.order <- order(abs(stats), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(stats)
nam <- names(stats[fs.order])
if (!is.null(nam)) fs.order <- nam #' fs.order <- noquote(nam)
res <- list(
fs.order = fs.order, fs.rank = fs.rank, stats = abs(stats),
pval = pval
)
return(res)
}
#' =========================================================================
#' wll-19-05-2008: Welch test for feature selection
#' NOTE: This function selects features based on the p-values rather than
#' the absolute values of the statistics. It also supports passing
#' additional arguments, such as whether the test is paired, and the
#' alternative hypothesis.
fs.welch.1 <- function(x, y, ...) {
tmp <- sapply(as.data.frame(x), function(x) {
tmp <- t.test(x ~ y, var.equal = F, ...)
c(tmp$statistic, tmp$p.value)
})
stats <- tmp[1, ]
pval <- tmp[2, ]
fs.order <- order(pval, decreasing = F, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(pval)
nam <- names(pval[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(
fs.order = fs.order, fs.rank = fs.rank, pval = pval,
stats = stats
)
return(res)
}
#' =========================================================================
#' Wilcoxon test for feature selection
#' lwc-21-06-2010: commence
#' Note: no doc and not exported
fs.wilcox <- function(x, y, ...) {
tmp <- sapply(as.data.frame(x), function(x) {
tmp <- wilcox.test(x ~ y, ...)
c(tmp$statistic, tmp$p.value)
})
stats <- tmp[1, ]
pval <- tmp[2, ]
fs.order <- order(abs(stats), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(stats)
nam <- names(stats[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(fs.order = fs.order, fs.rank = fs.rank, stats = abs(stats),
pval = pval)
return(res)
}
#' =========================================================================
#' ANOVA for feature selection
#' lwc-05-01-2007: commence
#' lwc-16-03-2007: minor change
fs.anova <- function(x, y, ...) {
tmp <- sapply(as.data.frame(x), function(x) {
tmp <- oneway.test(x ~ y, var.equal = T)
c(tmp$statistic, tmp$p.value)
})
stats <- tmp[1, ]
pval <- tmp[2, ]
fs.order <- order(abs(stats), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(stats)
nam <- names(stats[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(
fs.order = fs.order, fs.rank = fs.rank, stats = abs(stats),
pval = pval
)
return(res)
}
#' =========================================================================
#' Kruskal-Wallis test for feature selection
#' lwc-05-01-2007: commence
#' lwc-16-03-2007: minor change
fs.kruskal <- function(x, y, ...) {
tmp <- sapply(as.data.frame(x), function(x) {
tmp <- kruskal.test(x ~ y)
c(tmp$statistic, tmp$p.value)
})
stats <- tmp[1, ]
pval <- tmp[2, ]
fs.order <- order(abs(stats), decreasing = T, na.last = T)
fs.rank <- order(fs.order)
names(fs.rank) <- names(stats)
nam <- names(stats[fs.order])
if (!is.null(nam)) fs.order <- nam
res <- list(
fs.order = fs.order, fs.rank = fs.rank, stats = abs(stats),
pval = pval
)
return(res)
}
#' =========================================================================
#' Feature ranking validation by error estimation.
#' History:
#' 08-11-2006: commence
#' 09-11-2006: output different errors.
#' 26-11-2006: Borda count for selecting final feature order
#' 10-01-2007: Add user-defined data partitioning
#' 12-10-2007: Add fs.order as an argument. This allows the user to
#' estimate error using a feature order calculated somewhere
#' else. This function replaces rankvali (deprecated).
frankvali.default <- function(dat, cl, cl.method = "svm",
fs.method = "fs.auc", fs.order = NULL,
fs.len = "power2", pars = valipars(),
tr.idx = NULL, ...) {
#' validity checking
if (missing(dat) || missing(cl)) {
stop("data set or class are missing")
}
if (length(dim(dat)) != 2) {
stop("'dat' must be a matrix or data frame")
}
if (!is.factor(cl)) {
stop("cl must be a factor.")
}
if (nrow(dat) != length(cl)) stop("dat and cl don't match.")
if (length(unique(cl)) < 2) {
stop("Classification needs at least two classes.")
}
if (any(is.na(dat)) || any(is.na(cl))) {
stop("NA is not permitted in data set or class labels.")
}
dat <- as.matrix(dat)
rownames(dat) <- NULL #' strip off the row names
n <- nrow(dat) #' number of samples
p <- ncol(dat) #' size of feature
len <- get.fs.len(p, fs.len = fs.len)
nlen <- length(len)
#' construct index of train data
if (is.null(tr.idx)) {
if (pars$sampling == "cv" && pars$nreps > n) {
pars$sampling <- "loocv"
}
tr.idx <- trainind(cl, pars = pars)
}
pars$niter <- length(tr.idx)
pars$nreps <- length(tr.idx[[1]])
err.all <- list()
fs.list <- list()
for (i in 1:pars$niter) {
train.ind <- tr.idx[[i]]
res <- list()
#' generic loop for loocv, cv, scv, random and bootstrap.
for (j in 1:length(train.ind)) {
dat.tr <- dat[train.ind[[j]], , drop = F]
cl.tr <- cl[train.ind[[j]]]
dat.te <- dat[-train.ind[[j]], , drop = F]
cl.te <- cl[-train.ind[[j]]]
#' Error estimation of feature selection with fs.order or with rank
#' method.
res[[j]] <- frank.err(dat.tr, cl.tr, dat.te, cl.te,
cl.method = cl.method,
fs.method = fs.method, fs.order = fs.order,
fs.len = fs.len, ...
)
} #' end of j
#' feature ranking list
fs.list[[i]] <- sapply(res, function(x) cbind(x$fs.rank))
rownames(fs.list[[i]]) <- colnames(dat)
#' error estimation
err.all[[i]] <- t(sapply(res, function(x) cbind(x$error)))
colnames(err.all[[i]]) <- len
#' colnames(err.all[[i]]) <- paste("Len_", len, sep="")
} #' End of i
names(err.all) <- paste("Iter_", seq(1, pars$niter), sep = "")
names(fs.list) <- paste("Iter_", seq(1, pars$niter), sep = "")
err.iter <- t(sapply(err.all, function(x) apply(x, 2, mean)))
err.avg <- apply(err.iter, 2, mean)
#' or try
#' err.mat <- do.call("rbind", err.all)
#' err.avg <- apply(err.mat, 2, mean)
if (is.null(fs.order)) {
#' final feature ranking
#' Use Borda count to get the final feature order
fs.mat <- do.call("cbind", fs.list)
fs.score <- apply(fs.mat, 1, sum)
fs.order <- order(fs.score, decreasing = F) #' fs order from best to worst.
fs.rank <- order(fs.order, decreasing = F) #' fs rank score.
names(fs.rank) <- rownames(fs.mat)
temp <- names(fs.rank[fs.order])
if (!is.null(temp)) {
fs.order <- noquote(temp)
}
} else {
fs.rank <- order2rank(fs.order)
fs.method <- "User defined"
}
ret <- list(
fs.method = fs.method,
cl.method = cl.method,
fs.len = len, #' computational levels
err.avg = err.avg, #' average error
err.iter = err.iter, #' error matrix on each iteration
err.all = err.all, #' all error matrix
fs.order = fs.order, #' final feature order
fs.rank = fs.rank, #' final feature rank
sampling = switch(pars$sampling,
"cv" = "cross validation",
"loocv" = "leave-one-out cross-validation",
"boot" = "bootstrap",
"random" = "randomised validation (holdout)"
),
niter = pars$niter, #' number of iteration
nreps = pars$nreps
)
if (is.null(fs.order)) {
ret$fs.list <- fs.list #' feature list of all computation
}
class(ret) <- "frankvali"
return(ret)
}
#' ========================================================================
frankvali <- function(dat, ...) UseMethod("frankvali")
#' ========================================================================
#' lwc-12-11-2006:
frankvali.formula <- function(formula, data = NULL, ..., subset,
na.action = na.omit) {
call <- match.call()
if (!inherits(formula, "formula")) {
stop("method is only for formula objects")
}
m <- match.call(expand.dots = FALSE)
if (identical(class(eval.parent(m$data)), "matrix")) {
m$data <- as.data.frame(eval.parent(m$data))
}
m$... <- NULL
m[[1]] <- as.name("model.frame")
m$na.action <- na.action
m <- eval(m, parent.frame())
Terms <- attr(m, "terms")
attr(Terms, "intercept") <- 0
x <- model.matrix(Terms, m)
y <- model.extract(m, "response")
attr(x, "na.action") <- attr(y, "na.action") <- attr(m, "na.action")
ret <- frankvali.default(x, y, ..., na.action = na.action)
ret$call <- call
ret$call[[1]] <- as.name("frankvali")
ret$terms <- Terms
if (!is.null(attr(m, "na.action"))) {
ret$na.action <- attr(m, "na.action")
}
class(ret) <- c("frankvali.formula", class(ret))
return(ret)
}
#' =======================================================================
#' lwc-13-11-2006: boxplot the error rate on each iteration or computation.
boxplot.frankvali <- function(x, ...) {
col <- "lightgray"
xlab <- "Numbers of Feature"
ylab <- "Error Rate"
ylim <- c(0, 1.0)
if (x$niter > 1) {
main <- "Error rate on each iteration"
#' tmp <- data.frame(x$err.iter) #' why does data.frame change names?
tmp <- as.data.frame(x$err.iter) #' wll-17-06-2008: Is it OK?
colnames(tmp) <- colnames(x$err.iter)
} else {
main <- "Error rate on each computation"
tmp <- as.data.frame(x$err.all) #' tmp <- data.frame(x$err.all)
colnames(tmp) <- colnames(x$err.all)
}
boxplot(tmp, main = main, col = col, xlab = xlab, ylab = ylab, ylim = ylim)
}
#' ========================================================================
print.frankvali <- function(x, digits = 3, ...) {
cat("\nFeature selection method:\t\t", x$fs.method)
cat("\nClassification method:\t\t", x$cl.method)
cat("\nSampling:\t\t", x$sampling)
cat("\nNo. of iteration.:\t", x$niter)
cat("\nNo. of replications:\t", x$nreps)
cat("\nFeature numbers:\t", x$fs.len)
cat("\nAverage error:\t\t", round(x$err.avg, digits))
cat(
"\nFeature order (top 10):\t",
x$fs.order[1:min(10, length(x$fs.order))]
) #' best to worst
cat("\n")
invisible(x)
}
#' ========================================================================
summary.frankvali <- function(object, ...) {
structure(object, class = "summary.frankvali")
}
#' ========================================================================
print.summary.frankvali <- function(x, ...) {
print.frankvali(x)
cat("\nError of each iteration:\n")
print(x$err.iter)
#' cat("\nError of all computation:\n")
#' print(x$err.all)
if (!is.null(x$fs.list)) {
cat("\nFeature ranking List:\n")
print(x$fs.list)
}
}
#' =========================================================================
#' Error estimation of feature ranking on a single data set.
#' History:
#' 08-11-2006: commence
#' 10-11-2006: generalise cl.method.
#' 10-01-2007: minor modification
#' 24-09-2007: Generalise fs.method
#' 12-10-2007: Add fs.order as an argument.
frank.err <- function(dat.tr, cl.tr, dat.te, cl.te, cl.method = "svm",
fs.method = "fs.auc", fs.order = NULL,
fs.len = "power2", ...) {
if (missing(dat.tr) || missing(cl.tr) || missing(dat.te) || missing(cl.te)) {
stop("training and test data are missing")
}
#' feature ranking
if (is.null(fs.order)) {
tmp <- do.call(fs.method, c(list(x = dat.tr, y = cl.tr), list(...)))
fs.order <- tmp$fs.order
fs.rank <- tmp$fs.rank
} else {
fs.rank <- order2rank(fs.order)
fs.method <- "User defined"
}
#' generate feature length for error estimation
p <- ncol(dat.tr)
len <- get.fs.len(p, fs.len = fs.len)
nlen <- length(len)
#' error estimation
error <- numeric(length = nlen)
names(error) <- len
#' names(error) <- paste("Len_", len, sep="")
for (i in 1:nlen) {
#' feature selection
sel <- fs.order[1:len[i]]
error[i] <- classifier(dat.tr[, sel, drop = F], cl.tr,
dat.te[, sel, drop = F], cl.te,
method = cl.method, ...
)$err
}
ret <- list(
cl.method = cl.method, fs.len = len, error = error,
fs.method = fs.method, fs.order = fs.order, fs.rank = fs.rank
)
return(ret)
}
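#'
#' Usage sketch (hedged; assumes the package helper classifier() is
#' available, with a hand-made split of a two-class subset of iris):
#' data(iris)
#' x <- as.matrix(iris[iris$Species != "setosa", 1:4])
#' y <- factor(iris$Species[iris$Species != "setosa"])
#' set.seed(7)
#' idx <- sample(1:nrow(x), 60)
#' res <- frank.err(x[idx, ], y[idx], x[-idx, ], y[-idx],
#' cl.method = "svm", fs.method = "fs.auc", fs.len = "half")
#' res$error #' error rate at each feature-set size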
#' =========================================================================
#' wll-29-04-2008: Wrapper function for validation of feature selection by
#' classification.
#' wll-29-10-2008: Add a logical value controlling whether to validate all
#' feature subsets or not.
#' lwc-07-10-2011: use get.fs.len again but remove the last one.
#' Note: 1. A similar but more complete function is frankvali
#' 2. should rename cl.method as method
fs.cl <- function(dat, cl, fs.order = colnames(dat), fs.len = 1:ncol(dat),
cl.method = "svm", pars = valipars(), all.fs = FALSE, ...) {
len <- get.fs.len(ncol(dat), fs.len = fs.len)
if (!all.fs) {
len <- len[1:(length(len) - 1)] #' remove the last one
}
nlen <- length(len)
res <- sapply(1:nlen, function(i) {
id <- fs.order[1:len[i]] #' extract index of selected features
#' cat("\n--Feature Length = :",i,"\n"); flush.console()
res <- aam.cl(dat[, id, drop = F], cl, method = cl.method, pars = pars, ...)
})
res <- t(res)
rownames(res) <- len
res
}
#' =========================================================================
#' lwc-27-06-2011: Wrapper function for validation of feature selection by
#' classification.
#' Note: This function evaluates all features given by the user, either as
#' individual features or as aggregated feature sets.
fs.cl.1 <- function(dat, cl, fs.order = colnames(dat), cl.method = "svm",
pars = valipars(), agg_f = FALSE, ...) {
len <- length(fs.order)
if (agg_f) {
res <- sapply(1:len, function(i) {
id <- fs.order[1:i] #' aggregation of features
res <- aam.cl(dat[, id, drop = F], cl, method = cl.method, pars = pars, ...)
})
} else {
res <- sapply(1:len, function(i) {
id <- fs.order[i] #' individual feature
res <- aam.cl(dat[, id, drop = F], cl, method = cl.method, pars = pars, ...)
})
}
res <- t(res)
rownames(res) <- 1:len
res
}
#' =========================================================================
#' lwc-27-06-2011: Wrapper function for validation of feature selection by
#' classification.
#' lwc-19-05-2012: replace aam.cl with accest in order to get more results.
#' lwc-22-10-2012: To get aam, call perf.aam.
#' lwc-21-01-2014: To get other outcomes such as SE and CI, extra code is
#' needed. Refer to frankvali.
#' Usages:
#' data(abr1)
#' dat <- abr1$pos
#' x <- preproc(dat[, 110:500], method = "log10")
#' y <- factor(abr1$fact$class)
#' dat <- dat.sel(x, y, choices = c("1", "2"))
#' x.1 <- dat[[1]]$dat
#' y.1 <- dat[[1]]$cls
#' pars <- valipars(sampling = "cv", niter = 4, nreps = 4)
#'
#' #' multi-classes
#' fs <- fs.rf(x, y)
#' ord <- fs$fs.order[1:50]
#' res <- fs.cl.2(x, y,
#' fs.order = ord, cl.method = "svm", pars = pars,
#' agg_f = TRUE
#' )
#' perf.aam(res)
#'
#' #' two-classes
#' fs <- fs.rf(x.1, y.1)
#' ord <- fs$fs.order[1:50]
#' res.1 <- fs.cl.2(x.1, y.1,
#' fs.order = ord, cl.method = "svm", pars = pars,
#' agg_f = TRUE
#' )
#' perf.aam(res.1)
#'
fs.cl.2 <- function(dat, cl, fs.order = colnames(dat), cl.method = "svm",
pars = valipars(), agg_f = FALSE, ...) {
len <- length(fs.order)
if (agg_f) {
res <- lapply(1:len, function(i) {
id <- fs.order[1:i] #' aggregation of features
res <- accest(dat[, id, drop = F], cl,
method = cl.method,
pars = pars, ...
)
#' res <- aam.cl(dat[,id, drop=F],cl, method=cl.method, pars=pars,...)
})
} else {
res <- lapply(1:len, function(i) {
id <- fs.order[i] #' individual feature
res <- accest(dat[, id, drop = F], cl,
method = cl.method,
pars = pars, ...
)
#' res <- aam.cl(dat[,id, drop=F],cl, method=cl.method, pars=pars,...)
})
}
#' res <- t(res)
#' rownames(res) <- 1:len
names(res) <- 1:len
res
}
#' =======================================================================
#' lwc-21-01-2014: get average of acc, auc and mar from outcome of fs.cl.2
perf.aam <- function(res) {
tmp <- sapply(res, function(x) { #' x = res[[1]]
acc <- x$acc
auc <- ifelse(!is.null(x$auc), x$auc, NA)
mar <- ifelse(!is.null(x$mar), x$mar, NA)
res <- c(acc = acc, auc = auc, mar = mar)
})
return(t(tmp))
}
#' =======================================================================
#' lwc-24-05-2012: Wrapper function for perf.aam.
perf <- function(res) {
lapply(res, perf.aam) #' return the list visibly
}
#' ======================================================================
#' Generate a sequence of feature number
#' History:
#' 25-10-2006: commence
#' 31-10-2006: add user defined sequence
#' 15-11-2006: fix a bug in returning a decreasing vector
#' Usages:
#' get.fs.len(10,fs.len=c(1,5,3,11.2,7.8,23,1,0))
#' get.fs.len(200,fs.len="half")
get.fs.len <- function(p, fs.len = c("power2")) {
if (!is.character(fs.len)) {
fs.len <- as.vector(fs.len)
fs.len <- as.integer(fs.len)
fs.len <- fs.len[fs.len <= p & fs.len > 0]
fs.len <- c(p, fs.len)
x <- unique(fs.len)
} else {
fs.len <- match.arg(fs.len, c("full", "half", "power2"))
if (fs.len == "full") {
x <- seq(p, 1)
} else if (fs.len == "half") {
x <- tmp <- p
while (tmp > 1) {
tmp <- trunc(tmp / 2)
x <- c(x, tmp)
}
} else {
n <- ceiling(log2(p))
x <- 2^(n:0)
x[1] <- p
}
}
#' x <- sort(x,decreasing = T) #' must be decreasing order for SVM-RFE
x <- sort(x, decreasing = F)
return(x)
}
#' ======================================================================
#' lwc-02-02-2007: convert feature rank to feature order
#' NOTE: The vector of fs.rank should have variable names.
#' Usages:
#' load("D:\R_lwc\data\R-W2-GC\31_01_2007\class_rfrankvali_auto.RData")
#' fs.rank.list <- do.call("cbind",fs$rfe$fs.list)
#' tmp <- mt:::rank2order(fs.rank.list[,1])
#' fs.order.list <- sapply(1:ncol(fs.rank.list),
#' function(x) mt:::rank2order(fs.rank.list[,x]))
#' Internal function.
rank2order <- function(fs.rank) {
fs.order <- order(fs.rank)
tmp <- names(fs.rank[fs.order])
if (!is.null(tmp)) fs.order <- tmp
return(fs.order)
}
#' =======================================================================
#' wll-12-03-2007: convert feature order to feature rank
#' Internal function.
order2rank <- function(fs.order) {
fs.rank <- order(fs.order)
names(fs.rank) <- fs.order[fs.rank]
return(fs.rank)
}
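#'
#' Round-trip sketch (hedged; a hand-made named rank vector):
#' r <- c(a = 2, b = 1, c = 3)
#' ord <- rank2order(r) #' gives "b" "a" "c", best to worst
#' order2rank(ord) #' recovers c(a = 2, b = 1, c = 3)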
#' 1) feat.freq
#' 2) feat.cons
#' 3) feat.mfs
#' 4) feat.mfs.stab
#' 5) feat.mfs.stats
#' 6) feat.agg
#' 7) feat.rank.re
#' 8) fs.plsvip
#' 9) fs.plsvip.1
#' 10) fs.plsvip.2
#' 11) fs.pls
#' 12) fs.pca
#' 13) fs.bw
#' 14) fs.relief
#' 15) fs.rfe
#' 16) fs.snr
#' 17) fs.auc
#' 18) fs.rf
#' 19) fs.rf.1
#' 20) fs.welch
#' 21) fs.welch.1
#' 22) fs.wilcox
#' 23) fs.anova
#' 24) fs.kruskal
#' 25) frankvali.default
#' 26) frankvali
#' 27) frankvali.formula
#' 28) boxplot.frankvali
#' 29) print.frankvali
#' 30) summary.frankvali
#' 31) print.summary.frankvali
#' 32) frank.err
#' 33) fs.cl
#' 34) fs.cl.1
#' 35) fs.cl.2
#' 36) perf.aam
#' 37) perf
#' 38) get.fs.len
#' 39) rank2order
#' 40) order2rank