R source code for vcr.knn.train() and vcr.knn.newdata(): these functions carry out a k-nearest-neighbor classification on training and new data, and compute the quantities (predictions, PAC, farness) used to construct graphical displays.

```
vcr.knn.train = function(X,y,k){
#
# Carries out a k-nearest neighbor classification on the
# training data. Each case is predicted from the labels of
# all cases except its own.
# Various additional output is produced for the purpose
# of constructing graphical displays.
#
# X : This can be a rectangular matrix or data frame
# of (already standardized) measurements, or a
# dist object obtained from dist() or daisy().
# Missing values are not allowed.
# y : factor with the given (observed) class labels.
# There need to be non-missing values of y in order
# to be able to train the classifier.
# k : the number of nearest neighbors used. It can
# be selected by running cross-validation using
# a different package.
#
# Values include:
# yint : given labels as integer 1, 2, 3, ...
# y : given labels
# levels : levels of y
# predint : predicted class number. For each case this
# is the class with the highest mixture density.
# pred : predicted label of each object.
# altint : alternative class as integer, i.e. the non-given
# class number with the highest mixture density.
# altlab : alternative class of each object.
# PAC : probability of alternative class for each object
# fig : distance of each object i from each class g.
# farness : farness of each object to its given class.
# ofarness : For each object i, its lowest farness to any class.
# If much above 1, we suspect i may be an outlier.
# Auxiliary function for kNN:
#
predDis = function(nlab,labsi,dissi){
# Computes the kNN prediction and its average distance.
# When the highest frequency is tied, the predicted class
# is the one with lowest average distance from i.
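# Illustrative example (not part of the original code): with nlab = 3,
# labsi = c(1,2,2,1) and dissi = c(0.3,0.1,0.2,0.4), classes 1 and 2 are
# tied with frequency 2; class 2 is predicted because its average
# distance (0.1+0.2)/2 = 0.15 is smaller than that of class 1, 0.35.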
freqz = rep(0,times=nlab)
for(j in seq_len(nlab)){ # freqz = frequencies of labels
freqz[j] = sum(labsi == j, na.rm=T)
}
ranking = order(freqz, decreasing=T)
# positions of most to least frequent labels
sfreq = freqz[ranking] # frequencies in that order
predi = NA
if(sfreq[1] > sfreq[2]) { # frequency has only 1 maximizer
predi = ranking[1] # predicted class
averdis = sum(dissi[labsi == predi])/sfreq[1]
seconddis = NA
} else { # frequency has several maximizers
multipreds = ranking[which(sfreq == sfreq[1])]
multi = length(multipreds); multi
avdis = rep(0,times=multi)
for(j in seq_len(multi)) { # adds at each tied solution
avdis[j] = sum(dissi[labsi == multipreds[j]])/sfreq[1]
}
bestInMulti = which.min(avdis) # avdis minimizer
averdis = avdis[bestInMulti] # smallest average distance
# compute second smallest average distance:
seconddis = min(avdis[-bestInMulti])
predi = ranking[bestInMulti] # predicted class
}
freq = sfreq[1]
# if((extpred == T) & (gPred != predi)){
# # there is a given prediction, and it is not predi
# predi = gPred # reset predi
# freqi = freqz[predi] # update frequency
# if(freqi > 0) {
# averdis = sum(dissi[labsi == predi])/freqi
# seconddis = NA
# } else { # neighborhood has no case with same color
# averdis = NA
# seconddis = NA
# }
# freq = freqi
# }
return(list(predi = predi, averdis = averdis, freq = freq,
seconddis = seconddis))
}
# Here the main function starts:
if(inherits(X,"dist")){ # input is a dist object
n = cluster::sizeDiss(X) # number of cases
# turn it into a symmetric square matrix:
X = as.matrix(X)
dismat = X; rm(X) # R's renaming, to save space
if(sum(is.na(as.vector(dismat))) > 0){
stop("The input dissimilarity matrix X has NA's.") }
X = NULL # we don't need to return this
} else {
# Assumes X is already standardized
X = as.matrix(X) # in case it is a data frame
if(nrow(X) == 1) X=t(X)
if(sum(is.na(as.vector(X))) > 0){
stop(paste0("The data matrix X has NA's. You could either",
" remove them","\nor make a dissimilarity",
" matrix without NA's, e.g. with daisy().")) }
n = nrow(X)
dismat = as.matrix(dist(X, method="euclidean"))
# If you want a different metric, call dist() or daisy() first
# and then input the result into vcr.knn.train instead of
# the coordinate matrix.
}
if(n < 2) stop("The training data should have more than one case.")
# Check whether y and its levels are of the right form:
checked = checkLabels(y, n, training = T)
# If it did not stop: yint has length n, and its values are in
# 1,...,nlab without gaps. It is NA where y is NA, i.e. outside indsv.
lab2int = checked$lab2int
levels = checked$levels
nlab = length(levels)
yint = lab2int(y)
indsv = checked$indsv # cases in indsv can be visualized
nvis = length(indsv)
yintv = yint[indsv]
#
# Only neighbors with known class are useful for training:
# dim(dismat)
dismat = dismat[,indsv] # has the right colnames
# The k nearest neighbors of each i, as indices in 1,...,n are:
sortNgb = t(apply(dismat,1,order))[,-1] # excludes i itself,
# this is the usual convention for knn on training data.
# To make the entries case indices in 1,...,n we do:
sortNgb = apply(sortNgb,2,function(x) indsv[x])
# summary(sortNgb[7,])
dismat = t(apply(dismat,1,sort))[,-1] # excludes d(i,i)=0
sortDis = dismat; rm(dismat) # R's renaming, to save space
# dim(sortDis)
# gpreds = F # no external predictions were given
# if(!is.null(extpreds)){
# gpreds = T # external predictions were given
# if(!is.vector(extpreds)) stop("extpreds should be a vector")
# if(length(extpreds) != n){
# stop(paste0("extpreds should have length ",n)) }
# if(sum(!(extpreds %in% seq_len(nlab))) > 0) {
# stop(paste0("The given extpreds are not all in 1:",nlab)) }
# }
#
# Make neighborhoods:
#
# First take care of situations where the k-th distance
# is tied with (at least) the (k+1)-th. For such an object
# we include all those tied distances.
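# Illustrative example (not part of the original code): with k = 5 and
# sorted neighbor distances 0.3 0.5 0.7 0.9 0.9 0.9 1.2 ..., the 5th
# distance is tied with the 6th, so ktrues[i] becomes 6 and all 6
# neighbors are used.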
prec = 1e-12
ktrues = rep(NA, times=n)
for(i in seq_len(n)){ # i=1
rowdis = sortDis[i,]
ktrues[i] = length(which(rowdis < rowdis[k] + prec))
}
#
# Make predictions:
#
preds = matrix(rep(NA, times=2*n),ncol=2)
# if(gpreds){ preds[,1] = extpreds }
freqs = matrix(rep(0, times=3*n),ncol=3) # no NA's, OK
avedis = matrix(rep(NA, times=3*n),ncol=3)
#
for(i in seq_len(n)){ # loop over cases to predict # i=7
#
# Compute statistics for the true label, if it exists:
#
# print(i)
ktrue = ktrues[i]
labsi = yint[sortNgb[i, seq_len(ktrue)]]
# labels of neighbors exist
dissi = sortDis[i, seq_len(ktrue)] # dissimilarities from i
if(!is.na(yint[i])) {
freqs[i,3] = sum(labsi == yint[i])
# Note that freqs[i,3] can be zero, but not NA
# freqs[i,3]
if(freqs[i,3] > 0) {
avedis[i,3] = sum(dissi[labsi == yint[i]])/freqs[i,3] }
#} else { avedis[i,3] = NA }
}
#
# Compute predicted label:
#
# extpred = gpreds # existence of external predictions (T or F)
# gPred = if(extpred){ preds[i,1] } else { 0 }
out1 = predDis(nlab,labsi,dissi)
# calls the auxiliary function predDis
preds[i,1] = out1$predi
freqs[i,1] = out1$freq
if(!is.na(out1$averdis)){
avedis[i,1] = out1$averdis } else {
avedis[i,1] = min(sortDis[i,yint[sortNgb[i,]]==out1$predi]) }
#
# Compute second best prediction if there is one:
#
if(out1$freq < ktrues[i]) { # else nothing to do
remNgb = which(labsi != out1$predi) # remaining neighbors
labsi = labsi[remNgb]
dissi = dissi[remNgb]
# extpred = F # there is no external prediction for this
# gPred = 0 # there is no external prediction for this
out2 = predDis(nlab,labsi,dissi)
preds[i,2] = out2$predi # was initialized as NA
freqs[i,2] = out2$freq # was initialized as 0
avedis[i,2] = out2$averdis # was initialized as NA
}
} # ends loop over cases to predict
#
counts = freqs[,c(3,1,2)]
# given class, predicted class, alternative class
predint = preds[,1] # integer predictions for all cases,
# even those with missing y.
freq1v = freqs[indsv,1] # highest frequency (= of prediction)
freq2v = freqs[indsv,2] # second highest frequency
freq3v = freqs[indsv,3] # frequency of self
pred1v = preds[indsv,1] # prediction
pred2v = preds[indsv,2] # best non-prediction
#
# Compute PAC:
#
PAC = altint = rep(NA, n)
freqalt = freq1v
altintv = pred1v # preliminary
matched = which(pred1v == yintv) # prediction == selfclass
freqalt[matched] = freq2v[matched]
altintv[matched] = pred2v[matched]
PACv = freqalt/(freq3v + freqalt)
PAC[indsv] = PACv
altint[indsv] = altintv
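# Worked example (not part of the original code): if 3 neighbors of
# case i share its given class (freq3v = 3) and the strongest other
# class occurs 4 times (freqalt = 4), then PAC[i] = 4/(3+4) = 4/7,
# about 0.57, meaning the alternative class dominates i's neighborhood.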
#
### Compute initial fig
#
fig = matrix(rep(NA,nvis*nlab),ncol=nlab)
# fig[i,g] = median distance from object i to its (at most k) nearest neighbors in class g:
for(i in seq_len(nvis)){ # loop over all cases in indsv
for(g in seq_len(nlab)){ # loop over classes
ngbg = which(yint[sortNgb[indsv[i],]] == g)
if(length(ngbg) > k) { ngbg = ngbg[seq_len(k)] }
fig[i,g] = median(sortDis[indsv[i],ngbg])
}
}
#
# Compute farness:
#
farout = compFarness(type="knn", testdata = F, yint = yintv,
nlab = nlab, X = NULL, fig = fig, d = NULL,
figparams = NULL)
fig = matrix(rep(0,n*nlab),ncol=nlab)
fig[indsv,] = farout$fig
farness = ofarness = rep(NA,n)
farness[indsv] = farout$farness
ofarness[indsv] = farout$ofarness
if(nlab==2) altint = 3-yint # for consistency with svm etc.
# This way altint exists, even if it has frequency zero.
return(list(yint = yint,
y = levels[yint],
levels = levels,
predint = predint,
pred = levels[predint],
altint = altint,
altlab = levels[altint],
PAC = PAC,
figparams = farout$figparams,
fig = fig,
farness = farness,
ofarness = ofarness,
k = k,
ktrues = ktrues,
counts = counts,
X = X))
}
vcr.knn.newdata = function(Xnew, ynew=NULL, vcr.knn.train.out, LOO=FALSE){
#
# Predicts class labels for new data by k nearest neighbors,
# using the output of vcr.knn.train() on the training data.
# For new data with available labels in ynew, additional output
# is produced for constructing graphical displays.
#
# Xnew : If the training data was a matrix of
# coordinates, Xnew must be such a matrix
# with the same number of columns.
# If the training data was a set of
# dissimilarities, Xnew must be a rectangular
# matrix of dissimilarities, with each row
# containing the dissimilarities of a new
# case to all training cases.
# Missing values are not allowed.
# ynew : factor with class membership of each new
# case. Can be NA for some or all cases.
# If NULL, is assumed to be NA everywhere.
# vcr.knn.train.out : output of vcr.knn.train() on the training
# data. This contains nlab, levels and k.
# LOO : leave one out. Only used when testing this
# function on a subset of the training data.
# Default is LOO=F.
#
# Values include:
# yintnew : given labels as integer 1, 2, 3, ..., nlab.
# Can be NA.
# ynew : labels if yintnew is available, else NA.
# levels : levels of the response, from vcr.knn.train.out
# predint : predicted class number, always exists.
# pred : predicted label of each object
# altint : alternative label as integer if yintnew was given,
# else NA
# altlab : alternative label if yintnew was given, else NA
# PAC : probability of alternative class for each object
# with non-NA yintnew
# fig : distance of each object i from each class g.
# Always exists.
# farness : farness of each object to its given class, for
# objects with non-NA yintnew.
# ofarness : For each object i, its lowest farness to any class.
# Always exists. If much above 1, we suspect i may be
# an outlier.
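# Illustrative note (not part of the original code): if the training
# step used a dist/daisy object with ntrain cases, Xnew must be an
# (n_new x ntrain) matrix in which Xnew[i,j] is the dissimilarity
# between new case i and training case j, computed with the same
# metric as in training.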
# Auxiliary function for kNN:
#
predDis = function(nlab,labsi,dissi){
# Computes kNN prediction and its average distance.
# This works even if the data contains < nlab labels.
freqz = rep(0,times=nlab)
for(j in seq_len(nlab)){ # freqz = frequencies of labels
freqz[j] = sum(labsi == j, na.rm=T)
}
ranking = order(freqz, decreasing=T)
# positions of most to least frequent labels
sfreq = freqz[ranking] # frequencies in that order
predi = NA
if(sfreq[1] > sfreq[2]) { # frequency has only 1 maximizer
predi = ranking[1] # predicted class
averdis = sum(dissi[labsi == predi])/sfreq[1]
seconddis = NA
} else { # frequency has several maximizers
multipreds = ranking[which(sfreq == sfreq[1])]
multi = length(multipreds); multi
avdis = rep(0,times=multi)
for(j in seq_len(multi)) { # adds at each tied solution
avdis[j] = sum(dissi[labsi == multipreds[j]])/sfreq[1]
}
bestInMulti = which.min(avdis) # avdis minimizer
averdis = avdis[bestInMulti] # smallest average distance
# compute second smallest average distance:
seconddis = min(avdis[-bestInMulti])
predi = ranking[bestInMulti] # predicted class
}
freq = sfreq[1]
return(list(predi=predi,averdis=averdis,freq=freq,
seconddis = seconddis))
}
# Here the main function starts:
if(is.null(vcr.knn.train.out$X)){
# The training data were given as a dissimilarity matrix.
# Therefore Xnew must be a dissimilarity matrix too.
# A single new case may be given as a vector of dissimilarities:
if(is.vector(Xnew)) { Xnew = matrix(Xnew, nrow = 1) }
Xnew = as.matrix(Xnew)
if(sum(is.na(as.vector(Xnew))) > 0){
stop("The dissimilarity matrix Xnew contains NA's.") }
dismat = Xnew; rm(Xnew) # R's version of rename, to save space.
ntrain = length(vcr.knn.train.out$yint)
if(ncol(dismat) != ntrain){
stop(paste0("\nThe dissimilarity matrix Xnew should have ",ntrain,
"\ncolumns, as many as there were training cases.")) }
n = nrow(dismat) # number of new cases
} else {
# The training data was a matrix of coordinates.
Xnew = as.matrix(Xnew) # in case it is a data frame
if(nrow(Xnew) == 1) Xnew=t(Xnew)
d = ncol(vcr.knn.train.out$X)
if(ncol(Xnew) != d) { stop(
paste0("Xnew should have ",d,
" columns, like the training data.")) }
# Should check column names here (also in da)
# first vcr.da.train should output $colnames
if(sum(is.na(as.vector(Xnew))) > 0){
stop("The data matrix Xnew contains NA's.") }
ntrain = nrow(vcr.knn.train.out$X)
n = nrow(Xnew) # number of new cases
Xall = rbind(vcr.knn.train.out$X,Xnew) # has ntrain + n rows
indn = (ntrain+1):(ntrain+n) # range of new rows
dismat = as.matrix(dist(Xall, method="euclidean"))[indn, seq_len(ntrain), drop=F]
}
levels = vcr.knn.train.out$levels # same names as in training data
nlab = length(levels) # number of classes
#
if(is.null(ynew)) ynew = factor(rep(NA,n), levels=levels)
checked = checkLabels(ynew, n, training = F, levels)
lab2int = checked$lab2int # is a function
indsv = checked$indsv # INDiceS of cases we can Visualize,
# can even be empty, is OK.
nvis = length(indsv) # can be zero.
yintnew = rep(NA,n) # needed to overwrite "new" levels.
yintnew[indsv] = lab2int(ynew[indsv])
# Any "new" levels are now set to NA.
yintv = yintnew[indsv] # entries of yintnew that can be visualized
#
yint = vcr.knn.train.out$yint # class membership in training data
k = vcr.knn.train.out$k
# For each new case, find its neighbors in the training data:
sortNgb = t(apply(dismat,1,order))
if(LOO) sortNgb = sortNgb[,-1] # leave one out, only for testing
colnames(sortNgb) = NULL # colnames were irrelevant
# Obtain their dissimilarities:
dismat = t(apply(dismat,1,sort)) # to save space
if(LOO) dismat = dismat[,-1] # leave one out, only for testing
sortDis = dismat; rm(dismat) # to save space
#
# Make neighborhoods:
#
# First take care of situations where the k-th distance
# is tied with (at least) the (k+1)-th. For such an object
# we include all those tied distances.
prec = 1e-12
ktrues = rep(NA, n)
for(i in seq_len(n)){ # i=1
rowdis = sortDis[i,]
ktrues[i] = length(which(rowdis < rowdis[k] + prec))
}
ktrues
#
# Make predictions:
#
preds = matrix(rep(NA, times=2*n),ncol=2)
freqs = matrix(rep(0, times=3*n),ncol=3)
avedis = matrix(rep(NA, times=3*n),ncol=3)
#
for(i in seq_len(n)){ # loop over cases to predict
#
# Compute statistics for the true label, if it exists:
#
ktrue = ktrues[i]
labsi = yint[sortNgb[i, seq_len(ktrue)]]
# labels in training data exist
dissi = sortDis[i, seq_len(ktrue)]
# dissim. between i and training data
if(is.na(yintnew[i])) { # if i has no given label
freqs[i,3] = 0 } else { # if i does have a given label
freqs[i,3] = sum(labsi == yintnew[i]) }
if(freqs[i,3] > 0) {
avedis[i,3] = sum(dissi[labsi == yintnew[i]])/freqs[i,3]
} else { avedis[i,3] = NA }
#
# Compute predicted label:
#
out1 = predDis(nlab,labsi,dissi)
preds[i,1] = out1$predi
freqs[i,1] = out1$freq
if(!is.na(out1$averdis)){
avedis[i,1] = out1$averdis } else {
avedis[i,1] = min(sortDis[i,yint[sortNgb[i,]]==out1$predi]) }
#
# Compute second best prediction if there is one:
#
if(out1$freq < ktrues[i]) { # else nothing to do
remNgb = which(labsi != out1$predi) # remaining neighbors
labsi = labsi[remNgb]
dissi = dissi[remNgb]
out2 = predDis(nlab,labsi,dissi)
preds[i,2] = out2$predi # was initialized as NA
freqs[i,2] = out2$freq # was initialized as 0
avedis[i,2] = out2$averdis # was initialized as NA
}
} # ends loop over cases to predict
#
counts = freqs[,c(3,1,2)]
# given class, predicted class, alternative class
predint = preds[,1] # integer predictions for all cases,
# even those with missing ynew.
#
PAC = altint = rep(NA, n)
if(nvis > 0){ # there are new data with labels
# We can only compute PAC etc. for cases with available ynew.
freq1v = freqs[indsv,1]
freq2v = freqs[indsv,2] # can have (many) zeroes, no NAs
freq3v = freqs[indsv,3]
pred1v = preds[indsv,1] # prediction
pred2v = preds[indsv,2] # best non-prediction
#
# Compute PAC:
#
freqalt = freq1v # frequency of prediction
altintv = pred1v # preliminary
matched = which(pred1v == yintv) # prediction == selfclass
freqalt[matched] = freq2v[matched] # frequency of alternative
altintv[matched] = pred2v[matched]
PACv = freqalt/(freq3v + freqalt)
PAC[indsv] = PACv # the other entries of PAC remain NA
altint[indsv] = altintv
if(nlab==2) altint = 3-yintnew # for consistency with svm etc.
# When ynew is NA, altint will be NA as well.
}
#
# Compute initial fig[i,g] from new cases to training classes:
#
fig = matrix(rep(NA,n*nlab),ncol=nlab)
for(i in seq_len(n)){ # loop over all cases in newdata
for (g in seq_len(nlab)){ # loop over classes in training data
ngbg = which(yint[sortNgb[i,]] == g)
if(length(ngbg) > k) { ngbg = ngbg[seq_len(k)] }
fig[i,g] = median(sortDis[i,ngbg])
}
}
#
# Compute farness:
#
farout = compFarness(type="knn", testdata = T, yint = yintnew,
nlab = nlab, X = NULL, fig = fig, d = NULL,
figparams = vcr.knn.train.out$figparams)
return(list(yintnew = yintnew,
ynew = levels[yintnew],
levels = levels,
predint = predint,
pred = levels[predint],
altint = altint,
altlab = levels[altint],
PAC = PAC,
fig = farout$fig,
farness = farout$farness,
ofarness = farout$ofarness,
k = k,
ktrues = ktrues,
counts = counts))
}
```
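
A minimal usage sketch of the two functions above, assuming the classmap package (which contains vcr.knn.train(), vcr.knn.newdata(), and the helpers checkLabels() and compFarness()) is installed; the iris data and k = 5 are illustrative choices, not part of the source:

```
library(classmap)
X <- scale(iris[, 1:4])   # standardized measurements
y <- iris$Species
vcrtrain <- vcr.knn.train(X, y, k = 5)
table(given = vcrtrain$y, predicted = vcrtrain$pred)   # confusion matrix
summary(vcrtrain$PAC)     # PAC close to 1 flags poorly classified cases

# New data: here simply a subset of the training cases, for illustration.
vcrtest <- vcr.knn.newdata(X[101:150, ], y[101:150], vcrtrain)
table(given = vcrtest$ynew, predicted = vcrtest$pred)
```

The returned lists are intended as input for the display functions of the same package, such as classmap() and silplot(). When the training step was run on a dist object instead of a coordinate matrix, Xnew must be a matrix of dissimilarities between the new cases and all training cases, as described in the header of vcr.knn.newdata().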
