#' Permutation Analysis for classification
#'
#' simple function to create permutation testing of a classifier
#'
#' @param Data (dataframe) dataframe of the data
#' @param classCol (numeric or string) column number that contains the variable to be predicted
#' @param selectedCols (optional) (numeric or string) all the columns of data that would be used either as predictor or as feature
#' @param classifierFun (optional) (function) classifier function
#' @param nSims (optional) (numeric) number of simulations
#' @param plot (optional) (logical) whether to plot null accuracy distribution
#' @param silent (optional) (logical) whether to print messages or not
#' @param ... (optional) additional arguments for the function
#'
#' @details
#' The function implements Permutation tests for classification.
#' Permutation tests are a set of non-parametric methods for hypothesis testing without assuming
#' a particular distribution (Good, 2005). In case of classification analysis, this requires
#' shuffling the labels of the dataset (i.e. randomly shuffling classes/conditions between observations)
#' and calculating accuracies obtained.
#'
#' @return Returns \code{actualAcc} of the classification analysis,
#' \code{p-value} from permutation testing, \code{nullAcc} distribution of the permutation \code{figure} containing null distribution
#'
#' @examples
#' # perform a permutation testing for 10% of the kinematics movement data#'
#' # not run
#' # PermutationResult <- ClassPerm(Data = KinData, classCol = 1,
#' # selectedCols = c(1,2,12,22,32,42,52,62,72,82,92,102,112), nSims = 1000,cvType = "holdout")
#' # Output:
#' # Performing Permutation Analysis for Classification
#' #
#' # Performing Cross-validation
#' #
#' # Performing holdout Cross-validation
#' # genclassifier was not specified,
#' # Using default value of Classifier.svm (genclassifier = Classifier.svm)
#' #
#' # cvFraction was not specified,
#' # Using default value of 0.8 (cvFraction = 0.8)
#' #
#' # Proportion of Test/Train Data was : 0.2470588
#' # [1] "Test holdout Accuracy is 0.65"
#' # holdout classification Analysis:
#' # cvFraction : 0.8
#' # Test Accuracy 0.65
#' # *Legend:
#' # cvFraction = Fraction of data to keep for training data
#' # Test Accuracy = Accuracy from the Testing dataset
#' #
#' # Performing permutation testing...
#' # Performing 1000 simulations
#' # |=======================================================
#' # ==================================================================|100%
#' # Completed after 2 m
#' # The p-value of the permutation testing is 0.001
#' # p-value generated using the approximate method for p-value calculation.
#' # See Phipson, B. & Gordon K., S. (2010) for details
#'
#'
#' # Using LinearDA instead as function
#' # not run
#' # PermutationResult <- ClassPerm(Data = KinData, classCol = 1,
#' # selectedCols = c(1,2,12,22,32,42,52,62,72,82,92,102,112), nSims = 1000,classifierFun = LinearDA)
#'
#'
#' # Any minimalistic function can be used
#' # The ClassPerm function sends the dataframe Data, classCol,
#' # selectedCols as arguments
#' # not run
#' # myMinimalFun <- function(...){
#' # ***Calculate Error function as you want***
#' # return(accTest)
#' # }
#' # Use the function for permutation testing e.g.
#' # Results <- ClassPerm(Data = KinData, classCol=1,
#' # selectedCols = c(1,2,12,22,32,42,52,62,72,82,92,102,112),
#' # nSims = 1000,classifierFun = myMinimalFun)
#'
#'
#'
#'@import e1071 ggplot2 plyr caret statmod
#'
#'@author
#'Atesh Koul, C'MON unit, Istituto Italiano di Tecnologia
#'
#'\email{atesh.koul@@iit.it}
#'
#'@references
#'Phipson, B., & Smyth, G. K. (2010). Permutation P-values Should Never Be Zero: Calculating Exact P-values When Permutations Are Randomly Drawn.
#'Statistical Applications in Genetics and Molecular Biology, 9(1), 1544-6115.
#'
#'Ojala, M. & Garriga, G. C. Permutation Tests for Studying Classifier Performance. J. Mach. Learn. Res. 11, 1833-1863 (2010).
#'
#'Good, P. (2005). Permutation, Parametric and Bootstrap Tests of Hypotheses. New York: Springer-Verlag.
#'
#' @export
ClassPerm <- function(Data,classCol,selectedCols,classifierFun,nSims=1000,plot=TRUE,silent=FALSE,...){
# classifierFun is a function that the use inputs to calculate the permutation scores
# The form of this function should return accuracy as a single value.
# Extra options should be specified in the classifier function
# Sample classifier function
# load libraries
# library(caret)
# library(ggplot2)
# library(plotly)
# library(plyr)
# a bit complicated to implement a generic way to feed in variable arguements
# match variable input
# extras <- match.call(expand.dots= F)$...
if(!silent) cat("\nPerforming Permutation Analysis for Classification \n\n")
# Modified so that it works even with tibble; force the tibble to be a dataframe
# This is not the best way to proceed; Ideally, all the code should be updated
# to work with tibble
# make it a bit generic to handle matrices as well along with a warning
permittedDataClass <- c("tbl_df","matrix")
if(any(permittedDataClass %in% class(Data))){
warning(cat("the data entered is of the class ",class(Data),". Coersing it to be a dataframe. Check results"))
Data <- as.data.frame(Data)
}
# get the classCol name: in case u enter names of columns, it works anyways
# ensures that we have both correct classCol and predictorColNames as the function expects
if(is.character(classCol)){
predictorColNames <- classCol
classCol <- grep(predictorColNames,names(Data))
}else predictorColNames <- names(Data)[classCol]
if(missing(selectedCols)) selectedCols <- 1:length(names(Data))
# get the features: in case u enter names of columns, it works anyways
ifelse(is.character(selectedCols),selectedColNames <- selectedCols,selectedColNames <- names(Data)[selectedCols])
# old way
# selectedColNames <- names(Data)[selectedCols]
# protection measure if u forgot to put predictor column in the selected list
if(!(predictorColNames %in% selectedColNames)) stop("\n Predictor Column name not present in selected column name list")
# get feature columns without response
featureColNames <- selectedColNames[-grep(names(Data)[classCol],selectedColNames)]
# X <- Data[,featureColNames]
# Y <- Data[predictor]
if (missing(classifierFun)){
# if missing, use the default function of classifyFun from the library with default of svm
classifierFun <- classifyFun
}
# else {
# print(classifierFun)
# classifierFun <- get(classifierFun)
# }
# if your targets are not factors, make them..
if(!(is.factor(Data[,classCol]))) Data[,classCol] <- factor(Data[,classCol])
cat("Performing Cross-validation\n")
set.seed(111)
# First calculate actual accuracy
# Keep the seed constant in this case (we want the same accuracy)
actualAcc <- classifierFun(Data,classCol,selectedCols,SetSeed = FALSE,...)
# calculate permutation scores by randomly sampling targets
chanceAcc <- 1/(length(unique(Data[,classCol])))
# permutator is a simple function that randomly shuffles targets and spits out accuracies
permutator <-function(Data,predictorCol,selectedCols){
Data[,predictorCol] <- sample(Data[,predictorCol])
# use silence to not print the accuracies multiple times
# change the seed; otherwise the sample function above always outputs the same value
# override possible given silent option
silent <- TRUE
NullAcc <- classifierFun(Data,predictorCol,selectedCols,silent=TRUE,SetSeed = FALSE,...)
return(NullAcc)
}
cat("\n \nPerforming permutation testing...\n")
# default to 1000 repetitions
if(!exists("nSims")) nSims <- 1000
cat(paste0("\nPerforming ",nSims," simulations\n \n"))
# important to set seed here not only for reproducibility
# also so that we don't get same results over and over again if
# we set seed in the classification function
#set.seed(111)
# to avoid the note in R CMD check
nullAcc <- NULL
distNull <- data.frame(nullAcc=unlist(rlply(nSims, permutator(Data,classCol,selectedCols),.progress = progress_time())))
# calculate the number of times, the test statistic was greate than observed one
B <- sum(distNull$nullAcc >= actualAcc)
# A generic implementation of emperical p-values (defined by a Monte Carlo procedure)
# by Ojala, M. & Garriga, G. C. (2010) as well as by Good P. (2000):
#
# p_value = (sum(distNull$nullAcc >= actualAcc)+1)/(nSims+1)
#
# This implementation however, might inflate the type 1 error as reported by:
# Phipson, B. & Gordon K., S. (2010).
# The current implementation follows recommendation by Phipson, B. & Gordon K., S.
# and their package statmod for calculating either "exact" or "approximate" p-values
# for higher sample sizes (as is the case with most of classification analysis),
# approximate, instead of exact p-values are used. It's better to choose automatically
# here whether exact or approximate values are used (based on computational power).
#
# I utilize the log of the factorial as direct factorial values can get really high.
# These high values would give you Inf.
#
# The procedure is also generic compared to that from permp that defaults total permutations
# only for 2 classes.
#
# generalise the permp function to use with more than 2 groups
classSizes <- table(Data[,classCol])
lPossibleComb <- lfactorial(sum(classSizes)) - sum(lfactorial(classSizes))
# get the actual number of permutations to create
possibleComb <- exp(lPossibleComb)
# print the method used (for computational sanity)
# equivalent to choose if (lpossibleComb>log(10000)) pMethod <- "approximate"
# Same criteria used as in permp (check if using statmod version other than 1.4.29)
pMethod <- ifelse(possibleComb>10000,"approximate","exact")
p_value <- permp(B,nSims,total.nperm = possibleComb)
cat(paste0("\nThe p-value of the permutation testing is ",signif(p_value,2)))
cat(paste0("\n \np-value generated using the ",pMethod," method for p-value calculation. \nSee Phipson, B. & Gordon K., S. (2010) for details"))
if(plot){
# plot with automatically adjusting the height of the y-axis using 1 sd of the data
plotPerm <- ggplot(distNull,aes(nullAcc))+
#geom_line(aes(x = c(actualAcc,actualAcc),y=c(0,max(density(dframe$x)$y)+ sd(density(dframe$x)$y))),data=dataActual)+
geom_vline(xintercept = actualAcc,colour='blue')+
geom_vline(xintercept = chanceAcc,colour='red')+
geom_density(fill='darkblue',alpha=0.3)+
ggtitle("Permutation curve") +
# geom_curve(x = actualAcc-0.18, xend = actualAcc-0.01, y = max(density(distNull$nullAcc)$y)-0.03,
# yend = max(density(distNull$nullAcc)$y)-0.5,curvature = 0.2,
# arrow = grid::arrow(length = grid::unit(0.03, "npc")))+
# annotate("text", x = actualAcc-0.18,y = max(density(distNull$nullAcc)$y)+0.1,
# colour="blue", label = paste0('Actual Accuracy ',as.character(signif(actualAcc,2))),size=5)+
annotate("text", x = actualAcc-0.03,y = 0.2,
colour="blue", label = as.character(signif(actualAcc,2)),size=4)+
annotate("text", x = chanceAcc-0.03,y = 0.2,
colour="blue", label = 'chance',size=4)+
theme_bw(base_size = 18)+
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
axis.line.y = element_line(color = 'black'),
axis.line.x = element_line(color = 'black'))+
scale_y_continuous(expand = c(0, 0), limits = c(0, max(density(distNull$nullAcc)$y)+ 0.3*sd(density(distNull$nullAcc)$y))) +
scale_x_continuous(expand = c(0.01, 0.01), limits = c(-0.1, 1.01))
print(plotPerm)
}
Results = list(actualAcc = actualAcc,p_value=p_value,nullAcc = distNull$nullAcc,plot=plot,distNull= distNull)
return(Results)
}
# LinearDAPerm <- function(X,Y,cvType="LOTO"){
# #simple function to perform linear discriminant analysis
# # DOn't set seed here as it goes back to the permutator function and constraints the sample
# # to set the same value for Y over and over again.
# library(MASS)
# library(caret)
# if(cvType=="LOTO"){
# index <- createFolds(Y,k=length(Y),list=FALSE)
# acc <- vector()
#
# for(i in seq_along(index)){
# XTrain <- X[-i,]
# YTrain <- Y[-i]
# XTest <- X[i,]
# YTest <- Y[i]
# fit <- lda(XTrain,grouping = YTrain)
# predicted <- predict(fit,newdata=XTest)
# #print(table(predicted$class,DataTest[,predictorCol]))
# acc[i] <- sum(1 * (predicted$class==YTest))/length(predicted$class)
# }
# Acc <- mean(acc)
# #print(paste("The accuracy of discrimination was",signif(Acc,2)))
# }
#
# return(Acc)
#
#
# }
LinearSVM <- function(Data,classCol,selectedCols,SetSeed = T,silent,...){
# a simplistic k-fold crossvalidation
# For cross validation
#library(e1071)
#set.seed(111)
# defaults to 10 fold cross validation
# get the features: in case u enter names of columns, it works anyways
ifelse(is.character(selectedCols),selectedColNames <- selectedCols,selectedColNames <- names(Data)[selectedCols])
# old way
# selectedColNames <- names(Data)[selectedCols]
# get feature columns without response
featureColNames <- selectedColNames[-match(names(Data)[classCol],selectedColNames)]
# get the classCol name: in case u enter names of columns, it works anyways
# ensures that we have both correct classCol and predictorColNames as the function expects
if(is.character(classCol)){
predictorColNames <- classCol
classCol <- grep(predictorColNames,names(Data))
}else predictorColNames <- names(Data)[classCol]
# old way
# predictorColNames <- names(Data)[classCol]
Data = Data[,selectedCols]
Data[,predictorColNames] = factor(Data[,predictorColNames])
k = 10
# use stratified cross validation instead
# use 80% data for training
trainIndex <- createFolds(Data[,classCol], list = FALSE,k = k)
acc <- rep(NA,k)
for (i in 1:k){
trainX <- Data[!trainIndex==i,featureColNames]
testX <- Data[trainIndex==i,featureColNames]
trainY <- Data[!trainIndex==i,predictorColNames]
testY <- Data[trainIndex==i,predictorColNames]
model <- svm(trainX, trainY,kernel = "linear")
# test with train data
pred <- predict(model, testX)
acc[i] <- sum(1 * (pred==testY))/length(pred)
}
# old Method
# folds <- cvFolds(nrow(X),K = k)
# acc <- rep(NA,k)
#
# for (i in 1:k){
# trainX <- X[folds$subsets[folds$which != i], ]
# testX <- X[folds$subsets[folds$which == i], ]
# trainY <- Y[folds$subsets[folds$which != i]]
# testY <- Y[folds$subsets[folds$which == i]]
# model <- svm(trainX, trainY,kernel = "linear")
# # test with train data
# pred <- predict(model, testX)
# acc[i] <- sum(1 * (pred==testY))/length(pred)
# }
return(mean(acc,na.rm=T))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.