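# Cross-validated P-values for classification with the k-nearest-neighbor
# method: PV[i, th] is a cross-validated P-value for the hypothesis that
# training observation X[i, ] belongs to class th. If k is NULL or a
# vector of candidates, an optimal k is chosen per observation and class
# and returned as attributes(PV)$opt.k.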
cvpvs.knn <-
function(X, Y, k = NULL,
distance = c('euclidean', 'ddeuclidean', 'mahalanobis'),
cova = c('standard', 'M', 'sym')) {
cova <- match.arg(cova)
distance <- match.arg(distance)
# Adjust input
X <- as.matrix(X)
n <- NROW(X)
dimension <- NCOL(X)
Ylevels <- levels(factor(Y))
Y <- as.integer(factor(Y))
L <- max(Y)
# Stop if the number of rows of X and the length of Y differ
if(length(Y) != n) {
stop('length(Y) != nrow(X)')
}
if(any(k >= n)) stop('k >= sample size!')
# Computation of nvec: an L-dimensional column vector, where nvec[b]
# is the number of training observations belonging to class b.
nvec <- rep(0, L)
for(b in seq_len(L)) {
nvec[b] <- sum(Y == b)
# Stop if there are less than two observations from class b
if(nvec[b] == 0) {
stop(paste('no observations from class',
as.character(b),'!'))
}
if(nvec[b] == 1) {
stop(paste('only one observation from class',
as.character(b),'!'))
}
}
# If k is not a single integer, search for the optimal k.
if(length(k) > 1 || is.null(k)) {
if(is.null(k)) {
k <- 2:ceiling(length(Y) / 2)
}
opt.k <- matrix(0, n, L)
# Optimal k for the correct classes, i.e. opt.k[i, Y[i]]:
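# For each candidate k[l], T[j, l] is (up to ties) the fraction of the
# k[l] nearest neighbors of X[j, ] that belong to class th when X[j, ]
# is temporarily added to that class; the candidate minimizing
# colSums(T) is selected.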
switch(distance,
'euclidean' = {
# Use Euclidean distance
di.tmp <- as.matrix(dist(X))
for(th in seq_len(L)) {
T <- matrix(0, n, length(k))
thIndex <- Y == th
for(j in which(Y != th)) {
sdi <- sort(di.tmp[j, ])
thIndex[j] <- TRUE
for(l in seq_along(k)) {
Rl <- sdi[k[l]]
T[j, l] <- sum(di.tmp[j, thIndex] <= Rl) / k[l]
}
thIndex[j] <- FALSE
}
T <- colSums(T)
opt.k[thIndex, th] <- k[which.min(T)]
}
},
'ddeuclidean' = {
# Use data driven Euclidean distance
for(m in seq_len(dimension)) {
X[ , m] <- X[ , m] / sd(X[ , m])
}
di.tmp <- as.matrix(dist(X))
for(th in seq_len(L)) {
T <- matrix(0, n, length(k))
thIndex <- Y == th
for(j in which(Y != th)) {
sdi <- sort(di.tmp[j, ])
thIndex[j] <- TRUE
for(l in seq_along(k)) {
Rl <- sdi[k[l]]
T[j, l] <- sum(di.tmp[j, thIndex] <= Rl) / k[l]
}
thIndex[j] <- FALSE
}
T <- colSums(T)
opt.k[thIndex, th] <- k[which.min(T)]
}
},
'mahalanobis' = {
# Use Mahalanobis distance
di.tmp <- matrix(0, n, n)
# Compute sigma
switch(cova,
'standard' = {
# Compute mu
mu <- matrix(0, L, dimension)
for(m in seq_len(L)) {
mu[m, ] <- colMeans(X[Y == m, , drop = FALSE])
}
sigma <- sigmaSt(X = X, Y = Y, L = L,
dimension = dimension,
n = n, mu = mu)
},
'M' = {
tmp <- MVTMLE.LDA(X = X, Y = Y, L = L, n = n, nu = 1,
                  M0 = NULL, B0 = NULL,
                  delta = 10^(-7), steps = FALSE)
mu <- tmp$M
sigma <- tmp$S
B <- tmp$B
},
'sym' = {
tmp <- sigmaSym(X = X, Y = Y, L = L,
                dimension = dimension, n = n,
                nvec = nvec, B = NULL, nu = 0,
                delta = 10^(-7), prewhitened = TRUE,
                steps = FALSE, nmax = 500)
sigma <- tmp$S
B <- tmp$B
} )
# Compute Mahalanobis distances
sigma.inv <- solve(sigma)
for(m in seq_len(n)) {
di.tmp[m, ] <- mahalanobis(X, X[m, ], sigma.inv, inverted = TRUE)
}
for(th in seq_len(L)) {
T <- matrix(0, n, length(k))
thIndex <- Y == th
for(j in which(Y != th)) {
sdi <- sort(di.tmp[j, ])
thIndex[j] <- TRUE
for(l in seq_along(k)) {
Rl <- sdi[k[l]]
T[j, l] <- sum(di.tmp[j, thIndex] <= Rl) / k[l]
}
thIndex[j] <- FALSE
}
T <- colSums(T)
opt.k[thIndex, th] <- k[which.min(T)]
}
} )
# Optimal k for the other classes, i.e. opt.k[i, th] with th != Y[i]:
if(distance == 'euclidean' || distance == 'ddeuclidean') {
# Use Euclidean distance or data driven Euclidean distance
for(th in seq_len(L)) {
thIndex <- Y == th
for(i in which(Y != th)) {
T <- matrix(0, n, length(k))
thIndex[i] <- TRUE
for(j in which(!thIndex)) {
sdi <- sort(di.tmp[j, ])
thIndex[j] <- TRUE
for(l in seq_along(k)) {
Rl <- sdi[k[l]]
T[j, l] <- sum(di.tmp[j, thIndex] <= Rl) / k[l]
}
thIndex[j] <- FALSE
}
thIndex[i] <- FALSE
T <- colSums(T)
opt.k[i, th] <- k[which.min(T)]
}
}
} else {
# Use Mahalanobis distance
for(th in seq_len(L)) {
for(i in which(Y != th)) {
# Add NewX[i] temporarily to group th:
Y.tmp <- Y
Y.tmp[i] <- th
thIndex <- Y.tmp == th
nvec.tmp <- nvec
nvec.tmp[c(Y[i], th)] <- c(nvec[Y[i]] - 1, nvec[th] + 1)
# Adjust sigma
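# For cova = 'standard', the pooled covariance is adjusted by a rank-one
# correction (removing X[i, ] from class Y[i] and adding it to class th)
# instead of being recomputed from scratch.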
switch(cova,
'standard' = {
sigma.tmp <- sigma - (tcrossprod(X[i, ] - mu[Y[i], ]) /
                        (1 - 1 / nvec[Y[i]]) -
                      tcrossprod(X[i, ] - mu[th, ]) /
                        (1 - 1 / nvec[th])) / (n - L)
},
'M' = {
# Refit using the temporarily modified labels Y.tmp:
sigma.tmp <- MVTMLE.LDA(X = X, Y = Y.tmp, L = L, n = n, nu = 1,
                        M0 = mu, B0 = B,
                        delta = 10^(-7), steps = FALSE)$S
},
'sym' = {
sigma.tmp <- sigmaSym(X = X, Y = Y.tmp, L = L,
                      dimension = dimension, n = n,
                      nvec = nvec.tmp, B = B, nu = 0,
                      delta = 10^(-7), prewhitened = TRUE,
                      steps = FALSE, nmax = 500)$S
} )
# Compute Mahalanobis distances
sigmatmp.inv <- solve(sigma.tmp)
for(m in seq_len(n)) {
di.tmp[m, ] <- mahalanobis(X, X[m, ], sigmatmp.inv,
inverted = TRUE)
}
T <- matrix(0, n, length(k))
thIndex.tmp <- thIndex
for(j in which(!thIndex)) {
sdi <- sort(di.tmp[j, ])
thIndex.tmp[j] <- TRUE
for(l in seq_along(k)) {
Rl <- sdi[k[l]]
T[j, l] <- sum(di.tmp[j, thIndex.tmp] <= Rl) / k[l]
}
thIndex.tmp[j] <- FALSE
}
T <- colSums(T)
opt.k[i, th] <- k[which.min(T)]
}
}
}
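# With the optimal k determined, obtain the P-values by one recursive call
# of cvpvs.knn() per distinct value in opt.k, keeping for each call only
# the entries where that k is optimal.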
PV <- matrix(0, n, L)
for(j in unique(c(opt.k))) {
PV[opt.k == j] <- cvpvs.knn(X = X, Y = Y, k = j, distance = distance,
cova = cova)[opt.k == j]
}
attributes(PV)$opt.k <- opt.k
dimnames(PV)[[2]] <- Ylevels
return(PV)
}
# Compute distances
if(distance == "mahalanobis") {
# Use Mahalanobis distance
# Compute sigma
switch(cova,
'standard' = {
# Compute mu
mu <- matrix(0, L, dimension)
for(m in seq_len(L)) {
mu[m, ] <- colMeans(X[Y == m, , drop = FALSE])
}
sigma <- sigmaSt(X = X, Y = Y, L = L,
dimension = dimension, n = n, mu = mu)
},
'M' = {
tmp <- MVTMLE.LDA(X = X, Y = Y, L = L, n = n, nu = 1,
                  M0 = NULL, B0 = NULL,
                  delta = 10^(-7), steps = FALSE)
mu <- tmp$M
sigma <- tmp$S
B <- tmp$B
},
'sym' = {
tmp <- sigmaSym(X = X, Y = Y, L = L,
                dimension = dimension, n = n,
                nvec = nvec, B = NULL, nu = 0,
                delta = 10^(-7), prewhitened = TRUE,
                steps = FALSE, nmax = 500)
sigma <- tmp$S
B <- tmp$B
} )
# Compute Mahalanobis distances
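# (sigma is inverted once up front; inverted = TRUE tells mahalanobis()
# that sigma.inv is already the inverse of the covariance matrix)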
sigma.inv <- solve(sigma)
di <- matrix(0, n, n)
for(m in seq_len(n)) {
di[m, ] <- mahalanobis(X, X[m, ], sigma.inv, inverted = TRUE)
}
# Computation of
#   Rk : column vector, where Rk[i] is the distance between X[i, ] and
#        its k-th nearest neighbor.
#   Nk : matrix, where Nk[i, b] is the number of training observations
#        from class b among the k nearest neighbors of X[i, ].
Nk <- matrix(0, n, L)
sdi <- apply(di, 1, sort)
Rk <- sdi[k, ]
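# Note: neighbors are determined via the distance threshold Rk, so all
# observations tied with the k-th nearest neighbor are counted.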
for(b in seq_len(L)) {
bIndex <- Y == b
# get number of observations for each class among k
# neighbours:
Nk[ , b] <- rowSums(di[ , bIndex] <= Rk)
}
# Computation of the cross-validated P-values:
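# Each P-value compares the test statistic of X[i, ] with the statistics
# of all observations in the hypothesized class; see the explicit formula
# sum(Tv <= Tv[1]) / nvec.tmp[th] for the other classes below.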
PV <- matrix(0, n, L)
# P-values for the correct classes, i.e. PV[i, Y[i]]:
for(th in seq_len(L)) {
thIndex <- Y == th
Tv <- Nk[thIndex, th]
PV[thIndex, th] <- CompPVs(Tv)
}
# P-values for other classes, i.e. PV[i, th], th != Y[i]:
for(i in seq_len(n)) {
for(th in seq_len(L)[-Y[i]]) {
# Adjust nvec
nvec.tmp <- nvec
nvec.tmp[c(Y[i], th)] <- nvec.tmp[c(Y[i], th)] + c(- 1, 1)
# Adjust sigma
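# For cova = 'M' and 'sym', the estimator is refitted with the previous
# fit as starting value (M0 = mu, B0 = B, resp. B = B).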
switch(cova,
'standard' = {
sigma.tmp <- sigma - (tcrossprod(X[i, ] - mu[Y[i], ]) /
                        (1 - 1 / nvec[Y[i]]) -
                      tcrossprod(X[i, ] - mu[th, ]) /
                        (1 - 1 / nvec[th])) / (n - L)
},
'M' = {
Y.tmp <- Y
Y.tmp[i] <- th
sigma.tmp <- MVTMLE.LDA(X = X, Y = Y.tmp, L = L, n = n, nu = 1,
                        M0 = mu, B0 = B,
                        delta = 10^(-7), steps = FALSE)$S
},
'sym' = {
Y.tmp <- Y
Y.tmp[i] <- th
sigma.tmp <- sigmaSym(X = X, Y = Y.tmp, L = L,
                      dimension = dimension, n = n,
                      nvec = nvec.tmp, B = B, nu = 0,
                      delta = 10^(-7), prewhitened = TRUE,
                      steps = FALSE, nmax = 500)$S
} )
thIndex <- c(i, which(Y == th))
# Compute Mahalanobis distances
sigmatmp.inv <- solve(sigma.tmp)
for(m in thIndex) {
di[m, ] <- mahalanobis(X, X[m, ], sigmatmp.inv, inverted = TRUE)
}
# Recompute Rk (distance to the k-th nearest neighbor) with the adjusted
# distances; only the class-th column of Nk is needed below.
Nk <- matrix(0, n, L)
sdi <- apply(di, 1, sort)
Rk <- sdi[k, ]
Nk[ , th] <- rowSums(di[ , thIndex] <= Rk)
Tv <- Nk[thIndex, th]
PV[i, th] <- sum(Tv <= Tv[1]) / nvec.tmp[th]
}
}
dimnames(PV)[[2]] <- Ylevels
return(PV)
} else {
# Use Euclidean distance
if(distance == "ddeuclidean") {
# Use data driven Euclidean distance
for(i in seq_len(dimension)) {
X[ , i] <- X[ , i] / sd(X[ , i])
}
}
di <- as.matrix(dist(X))
}
# Computation of
#   Rk : column vector, where Rk[i] is the distance between X[i, ] and
#        its k-th nearest neighbor.
#   Nk : matrix, where Nk[i, b] is the number of training observations
#        from class b among the k nearest neighbors of X[i, ].
Nk <- matrix(0, n, L)
sdi <- apply(di, 1, sort)
Rk <- sdi[k, ]
for(b in seq_len(L)) {
bIndex <- Y == b
# get number of observations for each class among k
# neighbours:
Nk[ , b] <- rowSums(di[ , bIndex] <= Rk)
}
# Computation of the cross-validated P-values:
PV <- matrix(0, n, L)
# P-values for the correct classes, i.e. PV[i, Y[i]]:
for(th in seq_len(L)) {
thIndex <- Y == th
Tv <- Nk[thIndex, th]
PV[thIndex, th] <- CompPVs(Tv)
}
# P-values for other classes, i.e. PV[i, th], th != Y[i]:
for(i in seq_len(n)) {
for(th in seq_len(L)[-Y[i]]) {
thIndex <- c(i, which(Y == th))
Nk_tmp <- Nk
# Count X[i, ] as an additional class-th observation wherever it falls
# within the k-NN radius:
RkInd <- di[thIndex, i] <= Rk[thIndex]
Nk_tmp[thIndex, th] <- Nk_tmp[thIndex, th] + RkInd
Tv <- Nk_tmp[thIndex, th]
PV[i, th] <- sum(Tv <= Tv[1]) / (nvec[th] + 1)
}
}
dimnames(PV)[[2]] <- Ylevels
return(PV)
}
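
## ---------------------------------------------------------------------
## Usage sketch (not part of the original source): a minimal example of
## calling cvpvs.knn() on the iris data. It assumes the helper functions
## used above (sigmaSt, MVTMLE.LDA, sigmaSym, CompPVs) are available,
## e.g. after loading the package that provides them.
## ---------------------------------------------------------------------
# X <- as.matrix(iris[, 1:4])
# Y <- iris$Species
# # Fixed k:
# PV <- cvpvs.knn(X, Y, k = 5, distance = 'euclidean')
# head(PV)
# # Data-driven choice of k among the candidates 2:10; the selected
# # values are stored in the 'opt.k' attribute:
# PV2 <- cvpvs.knn(X, Y, k = 2:10, distance = 'euclidean')
# attributes(PV2)$opt.k[1:5, ]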