#' K-Nearest Neighbour correct cross-validation with distance input using parallel processing
#'
#' This function takes a square matrix of distances among specimens of known group membership
#' and returns the results of a leave-one-out correct cross validation identification for each
#' specimen to provide a correct cross-validation percentage.
#'
#' The function is primarily for use with resampling unequal groups to equal sample size a set
#' number of times. This process is carried out with parallel processing.
#'
#' This function applies both a weighted approach and an unweighted approach and returns both results.
#'
#' Note that this function is faster when datasets are large and/or when greater numbers of resampling
#' iterations are used. For small samples and few resampling iterations the function is unlikely to be
#' much faster, this is because in addition to the time it takes to carry out calculations the parallel
#' processing will need to compile the results at the end. This process adds additional time to the
#' process.
#'
#'
#' @inheritParams KnnDistCV
#' @return Returns a matrix of correct cross-validation percentages with columns 'Weighted' and 'Unweighted' and one row per resampling iteration; if \code{Verbose=TRUE}, returns a list containing this summary matrix and the per-specimen classifications alongside their known classifications.
#'
#'
#' @author Ardern Hulme-Beaman
#' @import doParallel
#' @import parallel
#' @import foreach
#' @export
KnnDistCVPar <- function(DistMat, GroupMembership, K, EqualIter=100, SampleSize=NA, TieBreaker=c('Random', 'Remove', 'Report'), Verbose=FALSE){

  # Convert a factor/character vector to numeric via character, avoiding
  # the factor-level integer codes that as.numeric() alone would return.
  chr2nu <- function(X){
    as.numeric(as.character(X))
  }

  # .combine function for foreach: element-wise rbind of each iteration's
  # two-element result list onto the accumulated results.
  ParOutput <- function(PreviousResults, ResultList){
    NewResults <- PreviousResults
    for (i in seq_along(ResultList)){
      NewResults[[i]] <- rbind(PreviousResults[[i]], ResultList[[i]])
    }
    return(NewResults)
  }

  # Randomly subsample every group down to a common size (the smallest
  # group's size unless GroupSize is supplied). Returns the selected
  # positions in GroupMembership and the matching group labels.
  BalancedGrps <- function(GroupMembership, GroupSize=SampleSize){
    if (is.na(GroupSize)){
      minSampSize <- min(table(as.character(GroupMembership)))
    } else {
      minSampSize <- GroupSize
    }
    sampleindex <- seq_along(GroupMembership)
    RandSamp <- stats::aggregate(x = sampleindex, by=list(factor(GroupMembership)), sample, size=minSampSize)
    Index <- c(t(RandSamp$x))
    GroupMem <- c(t(stats::aggregate(RandSamp$Group.1, list(RandSamp$Group.1), rep, minSampSize)$x))
    return(list(IndexedLocations=Index, Newfactors=GroupMem))
  }

  # Classify one specimen from its K nearest neighbours. With
  # Weighting=FALSE, X is a vector of neighbour group labels (simple
  # majority vote); with Weighting=TRUE, X is a matrix whose first column
  # is the labels and second column the distances (votes weighted by
  # inverse distance). Ties are resolved per TieBreaker.
  KVote <- function(X, K, Weighting=FALSE, TieBreaker=c('Random', 'Remove', 'Report')){
    VotingData <- X
    if (Weighting==FALSE){
      if (is.vector(VotingData)){
        MajorityVote <- names(which(table(VotingData[1:K])==max(table(VotingData[1:K]))))
      } else {
        stop('Error: VotingData is not a vector, KVote function expects VotingData to be a vector when Weighting=FALSE')
      }
    } else {
      if (dim(VotingData)[2]<2){
        stop('Error: VotingData must be a matrix of 2 columns: the first column must be the nearest neighbour classifications, the second column must be the weighting values')
      }
      # Inverse-distance weighting: sum 1/distance per candidate group.
      WeightingScores <- stats::aggregate(x = 1/chr2nu(VotingData[1:K,2]), by = list(as.factor(VotingData[1:K,1])), FUN = sum)
      MajorityVote <- as.character(WeightingScores$Group.1[which(WeightingScores$x==max(WeightingScores$x))])
    }
    if (length(MajorityVote)>1){
      # The length check must come before the equality test: with the
      # 3-element default for TieBreaker, 'TieBreaker=="Random"' is a
      # length-3 logical and '&&' errors on that from R 4.3 onwards.
      if (length(TieBreaker)==1 && TieBreaker=='Random'){
        ReturnedVote <- sample(MajorityVote, size = 1)
      } else if (length(TieBreaker)==1 && TieBreaker=='Remove'){
        ReturnedVote <- 'UnIDed'
      } else if (length(TieBreaker)==1 && TieBreaker=='Report'){
        ReturnedVote <- paste(MajorityVote, collapse = '_')
      } else {
        warning('Warning: TieBreaker not set or not set to recognisable method. Function will revert to Report for TieBreaker argument. If you have supplied a TieBreaker argument please check capitalisation and spelling.')
        ReturnedVote <- paste(MajorityVote, collapse = '_')
      }
    } else {
      ReturnedVote <- MajorityVote
    }
    return(ReturnedVote)
  }

  # One resampling iteration: balance the groups, build leave-one-out
  # neighbour orderings, and return the weighted and unweighted correct
  # cross-validation proportions (plus per-specimen calls if ParVerbose).
  ParEqualIter <- function(DistData, GrpMem, ParK=K, ParTieBreaker=TieBreaker, ParVerbose=Verbose){
    BalancingGrps <- BalancedGrps(GrpMem, SampleSize)
    BalancedDistMat <- DistData[BalancingGrps$IndexedLocations, BalancingGrps$IndexedLocations]
    # Sort each column of the distance matrix from smallest to greatest:
    # SortedDistMat holds the neighbours' group labels in distance order,
    # AdjSortedDistMat holds the corresponding sorted distances.
    AdjSortedDistMat <- SortedDistMat <- BalancedDistMat
    rownames(AdjSortedDistMat) <- rownames(SortedDistMat) <- 1:dim(SortedDistMat)[1]
    for (i in 1:dim(BalancedDistMat)[1]){
      AdjSortedDistMat[,i] <- sort(BalancedDistMat[,i])
      SortedDistMat[,i] <- as.character(BalancingGrps$Newfactors[sort(SortedDistMat[,i], index.return=TRUE)$ix])
    }
    # Drop the first row: distance 0, each specimen to itself.
    KIDmat <- SortedDistMat[-1,]
    Kweightmat <- AdjSortedDistMat[-1,]
    # Stack labels and distances into one array so a single apply() call
    # can pass both columns to KVote for the weighted vote.
    KArray <- array(data = NA, dim = c(dim(KIDmat), 2))
    KArray[,,1] <- KIDmat
    KArray[,,2] <- Kweightmat
    dimnames(KArray) <- list(dimnames(Kweightmat)[[1]], dimnames(Kweightmat)[[2]], c('Call', 'Weight'))
    WeightedRes <- apply(X = KArray, MARGIN = 2, FUN = KVote, K=ParK, Weighting=TRUE, TieBreaker=ParTieBreaker)
    UnweightedRes <- apply(X = KIDmat, MARGIN = 2, FUN = KVote, K=ParK, Weighting=FALSE, TieBreaker=ParTieBreaker)
    WeightedCCVPercent <- sum(BalancingGrps$Newfactors==WeightedRes)/length(BalancingGrps$Newfactors)
    UnweightedCCVPercent <- sum(BalancingGrps$Newfactors==UnweightedRes)/length(BalancingGrps$Newfactors)
    # Returned as a 1-row matrix (not a bare vector) so that the summary
    # element supports colnames() even when EqualIter == 1 and the
    # foreach .combine function is never called.
    ResultsSummaryTable <- matrix(c(WeightedCCVPercent, UnweightedCCVPercent), nrow = 1)
    if (ParVerbose==TRUE){
      ResTable <- cbind(rownames(BalancedDistMat), BalancingGrps$Newfactors, WeightedRes, UnweightedRes)
      colnames(ResTable) <- c('ID', 'True.Classification', 'Weighted.Classification', 'Unweighted.Classification')
      return(list(ResultsSummaryTable, ResTable))
    } else {
      return(list(ResultsSummaryTable, NA))
    }
  }

  cores <- parallel::detectCores()
  clust <- parallel::makeCluster(cores[1]-1)
  doParallel::registerDoParallel(clust)
  # Guarantee the cluster is released even if an iteration errors.
  on.exit(parallel::stopCluster(clust), add = TRUE)
  a <- 1
  # BUG FIX: TieBreaker is now forwarded to the workers; previously
  # ParTieBreaker was hard-coded to 'Report', silently ignoring the
  # user's TieBreaker argument.
  ParResults <- foreach::foreach(a = 1:EqualIter, .combine = ParOutput) %dopar% {
    ParEqualIter(DistData = DistMat, GrpMem = GroupMembership, ParK = K, ParTieBreaker = TieBreaker, ParVerbose = Verbose)
  }
  colnames(ParResults[[1]]) <- c('Weighted', 'Unweighted')
  if (Verbose==TRUE){
    names(ParResults) <- c('Iteration.Summaries', 'Verbose.Output')
    return(ParResults)
  } else {
    return(ParResults[[1]])
  }
}
#' Stepwise K-Nearest Neighbour correct cross-validation with distance input using parallel processing
#'
#' This function takes a square matrix of distances among specimens of known group membership
#' and returns the results of a leave-one-out correct cross validation identification exercise for
#' each incremental increase in k. The results of the analyses can be plotted to visualise the
#' change in correct identification given changes in k.
#'
#' The function is primarily for use with resampling unequal groups to equal sample size a set
#' number of times. This process is carried out with parallel processing.
#'
#' This function applies both a weighted approach and an unweighted approach and returns both results.
#'
#' Note that this function is faster when datasets are large and/or when greater numbers of resampling
#' iterations are used. For small samples and few resampling iterations the function is unlikely to be
#' much faster, this is because in addition to the time it takes to carry out calculations the parallel
#' processing will need to compile the results at the end. This process adds additional time to the
#' process.
#'
#'
#' @inheritParams KnnDistCVStepwise
#' @return Returns a list of two matrices ('Unweighted.Results' and 'Weighted.Results') of correct cross-validation proportions, with one column per value of k and one row per resampling iteration.
#' @details When the \code{PrintProg} is set to TRUE, the \code{\link[svMisc]{progress}} function of the \code{svMisc} package is used.
#'
#'
#' @keywords shape distances
#' @keywords Geometric morphometric distances
#' @author Ardern Hulme-Beaman
#' @import shapes
#' @import svMisc
#' @export
KnnDistCVStepwisePar <- function(DistMat, GroupMembership, Kmax, EqualIter=100, SampleSize=NA, TieBreaker=c('Random', 'Remove', 'Report'), PlotResults=TRUE){

  # Warn early: with leave-one-out on balanced groups, k values beyond the
  # smallest group size cannot draw all neighbours from any single group.
  MinSamp <- min(table(as.character(GroupMembership)))
  if (Kmax>MinSamp){
    warning('Kmax is set to higher than the smallest sample size.')
  }

  # Convert a factor/character vector to numeric via character, avoiding
  # the factor-level integer codes that as.numeric() alone would return.
  chr2nu <- function(X){
    as.numeric(as.character(X))
  }

  # .combine function for foreach: element-wise rbind of each iteration's
  # two-element result list onto the accumulated results.
  ParOutput <- function(PreviousResults, ResultList){
    NewResults <- PreviousResults
    for (i in seq_along(ResultList)){
      NewResults[[i]] <- rbind(PreviousResults[[i]], ResultList[[i]])
    }
    return(NewResults)
  }

  # Randomly subsample every group down to a common size (the smallest
  # group's size unless GroupSize is supplied). Returns the selected
  # positions in GroupMembership and the matching group labels.
  BalancedGrps <- function(GroupMembership, GroupSize=SampleSize){
    if (is.na(GroupSize)){
      minSampSize <- min(table(as.character(GroupMembership)))
    } else {
      minSampSize <- GroupSize
    }
    sampleindex <- seq_along(GroupMembership)
    RandSamp <- stats::aggregate(x = sampleindex, by=list(factor(GroupMembership)), sample, size=minSampSize)
    Index <- c(t(RandSamp$x))
    GroupMem <- c(t(stats::aggregate(RandSamp$Group.1, list(RandSamp$Group.1), rep, minSampSize)$x))
    return(list(IndexedLocations=Index, Newfactors=GroupMem))
  }

  # Classify one specimen from its K nearest neighbours. With
  # Weighting=FALSE, X is a vector of neighbour group labels (simple
  # majority vote); with Weighting=TRUE, X is a matrix whose first column
  # is the labels and second column the distances (votes weighted by
  # inverse distance). Ties are resolved per TieBreaker.
  KVote <- function(X, K, Weighting=FALSE, TieBreaker=c('Random', 'Remove', 'Report')){
    VotingData <- X
    if (Weighting==FALSE){
      if (is.vector(VotingData)){
        MajorityVote <- names(which(table(VotingData[1:K])==max(table(VotingData[1:K]))))
      } else {
        stop('Error: VotingData is not a vector, KVote function expects VotingData to be a vector when Weighting=FALSE')
      }
    } else {
      if (dim(VotingData)[2]<2){
        stop('Error: VotingData must be a matrix of 2 columns: the first column must be the nearest neighbour classifications, the second column must be the weighting values')
      }
      # Inverse-distance weighting: sum 1/distance per candidate group.
      WeightingScores <- stats::aggregate(x = 1/chr2nu(VotingData[1:K,2]), by = list(as.factor(VotingData[1:K,1])), FUN = sum)
      MajorityVote <- as.character(WeightingScores$Group.1[which(WeightingScores$x==max(WeightingScores$x))])
    }
    if (length(MajorityVote)>1){
      # The length check must come before the equality test: with the
      # 3-element default for TieBreaker, 'TieBreaker=="Random"' is a
      # length-3 logical and '&&' errors on that from R 4.3 onwards.
      if (length(TieBreaker)==1 && TieBreaker=='Random'){
        ReturnedVote <- sample(MajorityVote, size = 1)
      } else if (length(TieBreaker)==1 && TieBreaker=='Remove'){
        ReturnedVote <- 'UnIDed'
      } else if (length(TieBreaker)==1 && TieBreaker=='Report'){
        ReturnedVote <- paste(MajorityVote, collapse = '_')
      } else {
        warning('Warning: TieBreaker not set or not set to recognisable method. Function will revert to Report for TieBreaker argument. If you have supplied a TieBreaker argument please check capitalisation and spelling.')
        ReturnedVote <- paste(MajorityVote, collapse = '_')
      }
    } else {
      ReturnedVote <- MajorityVote
    }
    return(ReturnedVote)
  }

  # One resampling iteration: balance the groups, build leave-one-out
  # neighbour orderings once, then vote for every k from 1 to ParKmax.
  # Returns a list of two 1-row matrices of CCV proportions (one column
  # per k): Unweighted.Results and Weighted.Results.
  ParEqualIterStepwise <- function(DistData, GrpMem, ParKmax=Kmax, ParTieBreaker=TieBreaker, ParSampleSize=SampleSize){
    BalancingGrps <- BalancedGrps(GrpMem, ParSampleSize)
    BalancedDistMat <- DistData[BalancingGrps$IndexedLocations, BalancingGrps$IndexedLocations]
    # Sort each column of the distance matrix from smallest to greatest:
    # SortedDistMat holds the neighbours' group labels in distance order,
    # AdjSortedDistMat holds the corresponding sorted distances.
    AdjSortedDistMat <- SortedDistMat <- BalancedDistMat
    rownames(AdjSortedDistMat) <- rownames(SortedDistMat) <- 1:dim(SortedDistMat)[1]
    for (i in 1:dim(BalancedDistMat)[1]){
      AdjSortedDistMat[,i] <- sort(BalancedDistMat[,i])
      SortedDistMat[,i] <- as.character(BalancingGrps$Newfactors[sort(SortedDistMat[,i], index.return=TRUE)$ix])
    }
    # Drop the first row (distance 0, each specimen to itself) and stack
    # labels and distances into one array so a single apply() call can
    # pass both columns to KVote for the weighted vote.
    KArray <- array(data = NA, dim = c(dim(SortedDistMat[-1,]), 2))
    KArray[,,1] <- as.matrix(SortedDistMat[-1,])
    KArray[,,2] <- as.matrix(AdjSortedDistMat[-1,])
    dimnames(KArray) <- list(dimnames(AdjSortedDistMat[-1,])[[1]], dimnames(AdjSortedDistMat[-1,])[[2]], c('Call', 'Weight'))
    ResultsTable <- list(Unweighted.Results=matrix(NA, nrow = 1, ncol = ParKmax), Weighted.Results=matrix(NA, nrow = 1, ncol = ParKmax))
    for (K in 1:ParKmax){
      # BUG FIX: votes now use the ParTieBreaker parameter; previously
      # these calls referenced the enclosing TieBreaker directly, leaving
      # the ParTieBreaker argument dead.
      WeightedRes <- apply(X = KArray, MARGIN = 2, FUN = KVote, K=K, Weighting=TRUE, TieBreaker=ParTieBreaker)
      UnweightedRes <- apply(X = KArray[,,1], MARGIN = 2, FUN = KVote, K=K, Weighting=FALSE, TieBreaker=ParTieBreaker)
      WeightedCCVPercent <- sum(BalancingGrps$Newfactors==WeightedRes)/length(BalancingGrps$Newfactors)
      UnweightedCCVPercent <- sum(BalancingGrps$Newfactors==UnweightedRes)/length(BalancingGrps$Newfactors)
      ResultsTable$Unweighted.Results[1, K] <- UnweightedCCVPercent
      ResultsTable$Weighted.Results[1, K] <- WeightedCCVPercent
    }
    return(ResultsTable)
  }

  cores <- parallel::detectCores()
  clust <- parallel::makeCluster(cores[1]-1)
  doParallel::registerDoParallel(clust)
  # Guarantee the cluster is released even if an iteration errors.
  on.exit(parallel::stopCluster(clust), add = TRUE)
  a <- 1
  # BUG FIX: TieBreaker and SampleSize are now forwarded to the workers;
  # previously they were hard-coded to 'Report' and NA respectively,
  # silently ignoring the user's arguments.
  ParResults <- foreach::foreach(a = 1:EqualIter, .combine = ParOutput) %dopar% {
    ParEqualIterStepwise(DistData = DistMat, GrpMem = GroupMembership, ParKmax = Kmax, ParTieBreaker = TieBreaker, ParSampleSize = SampleSize)
  }
  ResultsTable <- ParResults
  if (PlotResults==TRUE){
    # Empty plot frame (type='n') scaled for percentages, then 5th-95th
    # percentile ribbons and mean lines for the weighted (dark blue) and
    # unweighted (light blue) results across iterations.
    graphics::plot(y = colMeans(ResultsTable$Unweighted.Results), x = 1:Kmax, type = 'n', ylim = c(10,105), ylab = 'CCV %', xlab = 'K')
    graphics::abline(h = seq(from = 20, to = 100, by = 10), v = seq(from = 2, to = Kmax, by =2), lty = '1919')
    WeightRange <- apply(ResultsTable$Weighted.Results, MARGIN = 2, FUN = stats::quantile, probs = c(.05, .95))
    graphics::polygon(x = c(1:Kmax, Kmax:1),
                      y = c(WeightRange[1,], WeightRange[2,Kmax:1])*100,
                      col = transpar('darkblue', alpha = 25),
                      border = NA)
    graphics::lines(y = colMeans(ResultsTable$Weighted.Results*100), x = 1:Kmax, col='darkblue', lwd=3)
    UnweightRange <- apply(ResultsTable$Unweighted.Results, MARGIN = 2, FUN = stats::quantile, probs = c(.05, .95))
    graphics::polygon(x = c(1:Kmax, Kmax:1),
                      y = c(UnweightRange[1,], UnweightRange[2,Kmax:1])*100,
                      col = transpar('lightblue', alpha = 95),
                      border = NA)
    graphics::lines(y = colMeans(ResultsTable$Unweighted.Results*100), x = 1:Kmax, col='lightblue', lwd=3)
    graphics::legend('bottomright', legend = c('Weighted', 'Unweighted'), col = c('darkblue', 'lightblue'), lty=1, lwd=3, bty = 'o')
  }
  return(ResultsTable)
}
# NOTE(review): the two lines below are website-embed boilerplate accidentally
# carried over from a documentation scrape; commented out so the file parses.
# Consider deleting them entirely.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.