# R/VCR_knn.R

vcr.knn.train = function(X,y,k){
  #
  # Carries out a k-nearest neighbor classification on the
  # training data. Each case is predicted from the labels of
  # all cases except its own.
  # Various additional output is produced for the purpose
  # of constructing graphical displays.
  #
  # X        : This can be a rectangular matrix or data frame
  #            of (already standardized) measurements, or a
  #            dist object obtained from dist() or daisy().
  #            Missing values are not allowed.
  # y        : factor with the given (observed) class labels.
  #            At least some values of y must be non-missing
  #            in order to train the classifier.
  # k        : the number of nearest neighbors used. It can
  #            be selected by running cross-validation using
  #            a different package.
  #
  # Values include:
  #   yint      : given labels as integer 1, 2, 3, ...
  #   y         : given labels
  #   levels    : levels of y
  #   predint   : predicted class number. For each case this is
  #               the most frequent class among its k nearest
  #               neighbors, with ties broken by average distance.
  #   pred      : predicted label of each object.
  #   altint    : alternative class as integer, i.e. the non-given
  #               class that is most frequent among the k nearest
  #               neighbors.
  #   altlab    : alternative class of each object.
  #   PAC       : probability of alternative class for each object
  #   fig       : distance of each object i from each class g.
  #   farness   : farness of each object to its given class.
  #   ofarness  : For each object i, its lowest farness to any class.
  #               If much above 1, we suspect i may be an outlier.

  # Auxiliary function for kNN:
  #
  predDis = function(nlab,labsi,dissi){
    # Computes the kNN prediction and its average distance.
    # When the highest frequency is tied, the predicted class
    # is the one with lowest average distance from i.
    freqz = rep(0,times=nlab)
    for(j in seq_len(nlab)){ # freqz = frequencies of labels
      freqz[j] = sum(labsi == j, na.rm=TRUE)
    }
    ranking = order(freqz, decreasing=TRUE)
    # positions of most to least frequent labels
    sfreq = freqz[ranking] # frequencies in that order
    predi = NA
    if(sfreq[1] > sfreq[2]) { # frequency has only 1 maximizer
      predi = ranking[1] # predicted class
      averdis = sum(dissi[labsi == predi])/sfreq[1]
      seconddis = NA
    } else { # frequency has several maximizers
      multipreds = ranking[which(sfreq == sfreq[1])]
      multi = length(multipreds)
      avdis = rep(0,times=multi)
      for(j in seq_len(multi)) { # average distance for each tied class
        avdis[j] = sum(dissi[labsi == multipreds[j]])/sfreq[1]
      }
      bestInMulti = which.min(avdis) # avdis minimizer
      averdis = avdis[bestInMulti] # smallest average distance
      # compute second smallest average distance:
      seconddis = min(avdis[-bestInMulti])
      predi = ranking[bestInMulti] # predicted class
    }
    freq = sfreq[1]
    return(list(predi = predi, averdis = averdis, freq = freq,
                seconddis = seconddis))
  }
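
  # Worked example of the tie-breaking rule: with nlab = 2, neighbor
  # labels c(1, 2, 1, 2) and distances c(0.5, 0.2, 0.9, 0.4), both
  # classes occur twice, so the tie is broken by average distance:
  # class 2 wins, since (0.2 + 0.4)/2 = 0.3 < (0.5 + 0.9)/2 = 0.7.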

  # Here the main function starts:
  if(inherits(X,"dist")){ # input is a dist object
    n = cluster::sizeDiss(X) # number of cases
    # turn it into a symmetric square matrix:
    X = as.matrix(X)
    dismat = X; rm(X) # R's renaming, to save space
    if(sum(is.na(as.vector(dismat))) > 0){
      stop("The input dissimilarity matrix X has NA's.") }
    X = NULL # we don't need to return this
  } else {
    # Assumes X is already standardized
    X = as.matrix(X) # in case it is a data frame
    if(nrow(X) == 1) X=t(X)
    if(sum(is.na(as.vector(X))) > 0){
      stop(paste0("The data matrix X has NA's. You could either",
                  " remove them","\nor make a dissimilarity",
                  " matrix without NA's, e.g. with daisy().")) }
    n = nrow(X)
    dismat = as.matrix(dist(X, method="euclidean"))
    # If you want a different metric, call dist() or daisy() first
    # and then input the result into vcr.knn.train instead of
    # the coordinate matrix.
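    # For example (a sketch, assuming a data frame df with mixed-type
    # variables and labels y):
    #   d <- cluster::daisy(df)    # Gower dissimilarities
    #   vcr.knn.train(d, y, k = 5) # pass the dist object directly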
  }
  if(n < 2) stop("The training data should have more than one case.")
  # Check whether y and its levels are of the right form:
  checked = checkLabels(y, n, training = TRUE)
  # If it did not stop: yint has length n, and its values are in
  # 1,...,nlab without gaps. It is NA where y is: outside indsv.
  lab2int = checked$lab2int
  levels  = checked$levels
  nlab    = length(levels)
  yint    = lab2int(y)
  indsv   = checked$indsv # cases in indsv can be visualized
  nvis    = length(indsv)
  yintv   = yint[indsv]
  #
  # Only neighbors with known class are useful for training:
  dismat = dismat[,indsv] # has the right colnames
  # The k nearest neighbors of each i, as indices in 1,...,n are:
  sortNgb = t(apply(dismat,1,order))[,-1] # excludes i itself,
  # this is the usual convention for knn on training data.
  # To make the entries case indices in 1,...,n we do:
  sortNgb = apply(sortNgb,2,function(x) indsv[x])
  dismat = t(apply(dismat,1,sort))[,-1] # excludes d(i,i)=0
  sortDis = dismat; rm(dismat) # R's renaming, to save space
  #
  # Make neighborhoods:
  #
  # First take care of situations where the k-th distance
  # is tied with (at least) the (k+1)-th. For such an object
  # we include all those tied distances.
  prec = 1e-12
  ktrues = rep(NA, times=n)
  for(i in seq_len(n)){
    rowdis = sortDis[i,]
    ktrues[i] = length(which(rowdis < rowdis[k] + prec))
  }
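  # For example, if k = 3 and the sorted distances of case i are
  # 0.1, 0.2, 0.3, 0.3, 0.3, 0.7, ... then ktrues[i] = 5, since the
  # two distances tied with the 3rd one are also included.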
  #
  # Make predictions:
  #
  preds  = matrix(NA, nrow=n, ncol=2)
  freqs  = matrix(0, nrow=n, ncol=3) # no NA's, OK
  avedis = matrix(NA, nrow=n, ncol=3)
  #
  for(i in seq_len(n)){ # loop over cases to predict
    #
    # Compute statistics for the true label, if it exists:
    #
    ktrue = ktrues[i]
    labsi = yint[sortNgb[i, seq_len(ktrue)]]
    # labels of neighbors exist
    dissi = sortDis[i, seq_len(ktrue)] # dissimilarities from i
    if(!is.na(yint[i])) {
      freqs[i,3] = sum(labsi == yint[i])
      # Note that freqs[i,3] can be zero, but not NA
      if(freqs[i,3] > 0) {
        avedis[i,3] = sum(dissi[labsi == yint[i]])/freqs[i,3] }
    }
    #
    # Compute predicted label:
    #
    out1       = predDis(nlab,labsi,dissi)
    # calls the auxiliary function predDis
    preds[i,1] = out1$predi
    freqs[i,1] = out1$freq
    if(!is.na(out1$averdis)){
      avedis[i,1] = out1$averdis } else {
        avedis[i,1] = min(sortDis[i,yint[sortNgb[i,]]==out1$predi]) }
    #
    # Compute second best prediction if there is one:
    #
    if(out1$freq < ktrues[i]) { # else nothing to do
      remNgb      = which(labsi != out1$predi) # remaining neighbors
      labsi       = labsi[remNgb]
      dissi       = dissi[remNgb]
      out2        = predDis(nlab,labsi,dissi)
      preds[i,2]  = out2$predi   # was initialized as NA
      freqs[i,2]  = out2$freq    # was initialized as 0
      avedis[i,2] = out2$averdis # was initialized as NA
    }
  } # ends loop over cases to predict
  #
  counts = freqs[,c(3,1,2)]
  # given class, predicted class, alternative class
  predint = preds[,1] # integer predictions for all cases,
  #                   # even those with missing y.
  freq1v = freqs[indsv,1] # highest frequency (= of prediction)
  freq2v = freqs[indsv,2] # second highest frequency
  freq3v = freqs[indsv,3] # frequency of self
  pred1v = preds[indsv,1] # prediction
  pred2v = preds[indsv,2] # best non-prediction
  #
  # Compute PAC:
  #
  PAC = altint = rep(NA, n)
  freqalt = freq1v
  altintv = pred1v # preliminary
  matched = which(pred1v == yintv) # prediction == selfclass
  freqalt[matched] = freq2v[matched]
  altintv[matched] = pred2v[matched]
  PACv = freqalt/(freq3v + freqalt)
  PAC[indsv] = PACv
  altint[indsv] = altintv
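  # Worked example: if the neighborhood of a case contains 4 members
  # of its given class (freq3v) and the best other class occurs 3
  # times (freqalt), then PAC = 3/(4 + 3) = 0.43: about 43% of the
  # relevant neighbors vote for the alternative class.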
  #
  ### Compute initial fig
  #
  fig = matrix(NA, nrow=nvis, ncol=nlab)
  # compute fig[i,g] of object i to class g:
  for(i in seq_len(nvis)){ # loop over all cases in indsv
    for(g in seq_len(nlab)){ # loop over classes
      ngbg = which(yint[sortNgb[indsv[i],]] == g)
      if(length(ngbg) > k) { ngbg = ngbg[seq_len(k)] }
      fig[i,g] = median(sortDis[indsv[i],ngbg])
    }
  }
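  # For example, if the members of class g among the sorted neighbors
  # of case i lie at distances 0.2, 0.5 and 0.8 (and k >= 3), then
  # fig[i,g] = median(c(0.2, 0.5, 0.8)) = 0.5.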
  #
  # Compute farness:
  #
  farout = compFarness(type="knn", testdata = F, yint = yintv,
                       nlab = nlab, X = NULL, fig = fig, d = NULL,
                       figparams = NULL)
  fig = matrix(0, nrow=n, ncol=nlab)
  fig[indsv,] = farout$fig
  farness = ofarness = rep(NA,n)
  farness[indsv] = farout$farness
  ofarness[indsv] = farout$ofarness
  if(nlab==2) altint = 3-yint # for consistency with svm etc.
  # This way altint exists, even if it has frequency zero.
  return(list(yint = yint,
              y = levels[yint],
              levels = levels,
              predint = predint,
              pred = levels[predint],
              altint = altint,
              altlab = levels[altint],
              PAC = PAC,
              figparams = farout$figparams,
              fig = fig,
              farness = farness,
              ofarness = ofarness,
              k = k,
              ktrues = ktrues,
              counts = counts,
              X = X))
}

vcr.knn.newdata = function(Xnew, ynew=NULL, vcr.knn.train.out, LOO=FALSE){
  #
  # Predicts class labels for new data by k nearest neighbors,
  # using the output of vcr.knn.train() on the training data.
  # For new data with available labels in ynew, additional output
  # is produced for constructing graphical displays.
  #
  #   Xnew              : If the training data was a matrix of
  #                       coordinates, Xnew must be such a matrix
  #                       with the same number of columns.
  #                       If the training data was a set of
  #                       dissimilarities, Xnew must be a rectangular
  #                       matrix of dissimilarities, with each row
  #                       containing the dissimilarities of a new
  #                       case to all training cases.
  #                       Missing values are not allowed.
  #   ynew              : factor with class membership of each new
  #                       case. Can be NA for some or all cases.
  #                       If NULL, is assumed to be NA everywhere.
  #   vcr.knn.train.out : output of vcr.knn.train() on the training
  #                       data. This contains nlab, levels and k.
  #   LOO               : leave one out. Only used when testing this
  #                       function on a subset of the training data.
  #                       Default is LOO=FALSE.
  #
  # Values include:
  #   yintnew   : given labels as integer 1, 2, 3, ..., nlab.
  #               Can be NA.
  #   ynew      : labels if yintnew is available, else NA.
  #   levels    : levels of the response, from vcr.knn.train.out
  #   predint   : predicted class number, always exists.
  #   pred      : predicted label of each object
  #   altint    : alternative label as integer if yintnew was given,
  #               else NA
  #   altlab    : alternative label if yintnew was given, else NA
  #   PAC       : probability of alternative class for each object
  #               with non-NA yintnew
  #   fig       : distance of each object i from each class g.
  #               Always exists.
  #   farness   : farness of each object to its given class, for
  #               objects with non-NA yintnew.
  #   ofarness  : For each object i, its lowest farness to any class.
  #               Always exists. If much above 1, we suspect i may be
  #               an outlier.

  # Auxiliary function for kNN:
  #
  predDis = function(nlab,labsi,dissi){
    # Computes kNN prediction and its average distance.
    # This works even if the data contains < nlab labels.
    freqz = rep(0,times=nlab)
    for(j in seq_len(nlab)){ # freqz = frequencies of labels
      freqz[j] = sum(labsi == j, na.rm=TRUE)
    }
    ranking = order(freqz, decreasing=TRUE)
    # positions of most to least frequent labels
    sfreq = freqz[ranking] # frequencies in that order
    predi = NA
    if(sfreq[1] > sfreq[2]) { # frequency has only 1 maximizer
      predi = ranking[1] # predicted class
      averdis = sum(dissi[labsi == predi])/sfreq[1]
      seconddis = NA
    } else { # frequency has several maximizers
      multipreds = ranking[which(sfreq == sfreq[1])]
      multi = length(multipreds)
      avdis = rep(0,times=multi)
      for(j in seq_len(multi)) { # average distance for each tied class
        avdis[j] = sum(dissi[labsi == multipreds[j]])/sfreq[1]
      }
      bestInMulti = which.min(avdis) # avdis minimizer
      averdis = avdis[bestInMulti] # smallest average distance
      # compute second smallest average distance:
      seconddis = min(avdis[-bestInMulti])
      predi = ranking[bestInMulti] # predicted class
    }
    freq = sfreq[1]
    return(list(predi=predi,averdis=averdis,freq=freq,
                seconddis = seconddis))
  }

  # Here the main function starts:
  if(is.null(vcr.knn.train.out$X)){
    # The training data were given as a dissimilarity matrix.
    # Therefore Xnew must be a dissimilarity matrix too. Xnew is
    # only converted inside each branch, so that a plain vector of
    # dissimilarities for a single new case is still recognized
    # here and becomes a 1-row matrix.
    if(is.vector(Xnew)) { Xnew = matrix(Xnew, nrow = 1) }
    Xnew = as.matrix(Xnew)
    if(sum(is.na(as.vector(Xnew))) > 0){
      stop("The dissimilarity matrix Xnew contains NA's.") }
    dismat = Xnew; rm(Xnew) # R's version of rename, to save space.
    ntrain = length(vcr.knn.train.out$yint)
    if(ncol(dismat) != ntrain){
      stop(paste0("\nThe dissimilarity matrix Xnew should have ",ntrain,
                  "\ncolumns, as many as there were training cases.")) }
    n = nrow(dismat) # number of new cases
  } else {
    # The training data was a matrix of coordinates.
    Xnew = as.matrix(Xnew) # in case it is a data frame
    if(nrow(Xnew) == 1) Xnew=t(Xnew)
    d = ncol(vcr.knn.train.out$X)
    if(ncol(Xnew) != d) { stop(
      paste0("Xnew should have ",d,
             " columns, like the training data.")) }
    # TODO: also check the column names here (and in the da
    # functions); vcr.da.train would first need to output $colnames.
    if(sum(is.na(as.vector(Xnew))) > 0){
      stop("The data matrix Xnew contains NA's.") }
    ntrain = nrow(vcr.knn.train.out$X)
    n      = nrow(Xnew) # number of new cases
    Xall   = rbind(vcr.knn.train.out$X,Xnew) # has ntrain + n rows
    indn   = (ntrain+1):(ntrain+n) # range of new rows
    dismat = as.matrix(dist(Xall, method="euclidean")
    )[indn, seq_len(ntrain),drop=F]
  }
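  # In both cases dismat now has one row per new case and one column
  # per training case: dismat[i, j] is the dissimilarity between new
  # case i and training case j.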
  levels = vcr.knn.train.out$levels # same names as in training data
  nlab   = length(levels) # number of classes
  #
  if(is.null(ynew)) ynew = factor(rep(NA,n), levels=levels)
  checked = checkLabels(ynew, n, training = FALSE, levels)
  lab2int = checked$lab2int # is a function
  indsv   = checked$indsv   # INDiceS of cases we can Visualize,
  #                         # can even be empty, is OK.
  nvis    = length(indsv)   # can be zero.
  yintnew = rep(NA,n)       # needed to overwrite "new" levels.
  yintnew[indsv] = lab2int(ynew[indsv])
  # Any "new" levels are now set to NA.
  yintv   = yintnew[indsv]  # entries of yintnew that can be visualized
  #
  yint = vcr.knn.train.out$yint # class membership in training data
  k    = vcr.knn.train.out$k
  # For each new case, find its neighbors in the training data:
  sortNgb = t(apply(dismat,1,order))
  if(LOO) sortNgb = sortNgb[,-1] # leave one out, only for testing
  colnames(sortNgb) = NULL # colnames were irrelevant
  # Obtain their dissimilarities:
  dismat  = t(apply(dismat,1,sort)) # to save space
  if(LOO) dismat = dismat[,-1] # leave one out, only for testing
  sortDis = dismat; rm(dismat) # to save space
  #
  # Make neighborhoods:
  #
  # First take care of situations where the k-th distance
  # is tied with (at least) the (k+1)-th. For such an object
  # we include all those tied distances.
  prec = 1e-12
  ktrues = rep(NA, n)
  for(i in seq_len(n)){
    rowdis = sortDis[i,]
    ktrues[i] = length(which(rowdis < rowdis[k] + prec))
  }
  #
  # Make predictions:
  #
  preds  = matrix(NA, nrow=n, ncol=2)
  freqs  = matrix(0, nrow=n, ncol=3)
  avedis = matrix(NA, nrow=n, ncol=3)
  #
  for(i in seq_len(n)){ # loop over cases to predict
    #
    # Compute statistics for the true label, if it exists:
    #
    ktrue = ktrues[i]
    labsi = yint[sortNgb[i, seq_len(ktrue)]]
    # labels in training data exist
    dissi = sortDis[i, seq_len(ktrue)]
    # dissim. between i and training data
    if(is.na(yintnew[i])) { # if i has no given label
      freqs[i,3] = 0 } else { # if i does have a given label
        freqs[i,3] = sum(labsi == yintnew[i]) }
    if(freqs[i,3] > 0) {
      avedis[i,3] = sum(dissi[labsi == yintnew[i]])/freqs[i,3]
    } else { avedis[i,3] = NA }
    #
    # Compute predicted label:
    #
    out1       = predDis(nlab,labsi,dissi)
    preds[i,1] = out1$predi
    freqs[i,1] = out1$freq
    if(!is.na(out1$averdis)){
      avedis[i,1] = out1$averdis } else {
        avedis[i,1] = min(sortDis[i,yint[sortNgb[i,]]==out1$predi]) }
    #
    # Compute second best prediction if there is one:
    #
    if(out1$freq < ktrues[i]) { # else nothing to do
      remNgb      = which(labsi != out1$predi) # remaining neighbors
      labsi       = labsi[remNgb]
      dissi       = dissi[remNgb]
      out2        = predDis(nlab,labsi,dissi)
      preds[i,2]  = out2$predi   # was initialized as NA
      freqs[i,2]  = out2$freq    # was initialized as 0
      avedis[i,2] = out2$averdis # was initialized as NA
    }
  } # ends loop over cases to predict
  #
  counts = freqs[,c(3,1,2)]
  # given class, predicted class, alternative class
  predint = preds[,1] # integer predictions for all cases,
  #                   # even those with missing ynew.
  #
  PAC = altint = rep(NA, n)
  if(nvis > 0){ # there are new data with labels
    # We can only compute PAC etc. for cases with available ynew.
    freq1v = freqs[indsv,1]
    freq2v = freqs[indsv,2] # can have (many) zeroes, no NAs
    freq3v = freqs[indsv,3]
    pred1v = preds[indsv,1] # prediction
    pred2v = preds[indsv,2] # best non-prediction
    #
    # Compute PAC:
    #
    freqalt = freq1v # frequency of prediction
    altintv = pred1v # preliminary
    matched = which(pred1v == yintv) # prediction == selfclass
    freqalt[matched] = freq2v[matched] # frequency of alternative
    altintv[matched] = pred2v[matched]
    PACv = freqalt/(freq3v + freqalt)
    PAC[indsv] = PACv # the other entries of PAC remain NA
    altint[indsv] = altintv
    if(nlab==2) altint = 3-yintnew # for consistency with svm etc.
    # When ynew is NA, the corresponding altint is NA as well.
  }
  #
  # Compute initial fig[i,g] from new cases to training classes:
  #
  fig = matrix(NA, nrow=n, ncol=nlab)
  for(i in seq_len(n)){ # loop over all cases in newdata
    for (g in seq_len(nlab)){ # loop over classes in training data
      ngbg = which(yint[sortNgb[i,]] == g)
      if(length(ngbg) > k) { ngbg = ngbg[seq_len(k)] }
      fig[i,g] = median(sortDis[i,ngbg])
    }
  }
  #
  # Compute farness:
  #
  farout = compFarness(type="knn", testdata = T, yint = yintnew,
                       nlab = nlab, X = NULL, fig = fig, d = NULL,
                       figparams = vcr.knn.train.out$figparams)
  return(list(yintnew = yintnew,
              ynew = levels[yintnew],
              levels = levels,
              predint = predint,
              pred = levels[predint],
              altint = altint,
              altlab = levels[altint],
              PAC = PAC,
              fig = farout$fig,
              farness = farout$farness,
              ofarness = farout$ofarness,
              k = k,
              ktrues = ktrues,
              counts = counts))
}
