# Pseudocode:
# get platypus parameters and view information
# load labelled data
# divide labelled data in k subsets
# for fold.i in 1:k
# hold out subset fold.i of labelled data
# train platypus.R on all-but-fold.i labelled (and unlabelled) data with the given views # unlabelled data is optional, but making use of unlabelled data is one of the biggest advantages of platypus
# predict fold.i subset on trained platypus views
#' k-fold cross validation for platypus
#'
#' @param view.list List of view objects
#' @param fn.labs File containing outcome labels
#' @param classcol.labs Which column from the labels file to use for learning
#' @param cv.folds number of folds for label learning validation (similar to cross validation folds), default=10
#' @param n.iters Maximal number of iterations for each platypus run, default=100
#' @param majority.threshold.percent Percent agreement required to learn a sample's class label, default=100
#' @param nfolds Number of cross-validation folds
#' @param expanded.output Expanded output: returned result list contains a list of trained views after each iteration, default=FALSE
#' @param updating Updating the accuracies of the single views in each iteration, default=FALSE
#' @param ignore.label Label class to ignore, if any. Defaults to 'intermediate'
#' @param parallel Whether or not to run in parallel mode.
#' @param output.folder Name of the folder where output is stored.
#' @return A list containing fold.accuracy, labelling.matrix,labelling.matrices.views
#' @keywords platypus
#' @export
cv.platypus <- function(view.list,fn.labs,classcol.labs=1,cv.folds=10,n.iters=100,majority.threshold.percent=100,nfolds=10, expanded.output=FALSE,updating=FALSE,ignore.label='intermediate',parallel=FALSE,output.folder=NA) {
## Set debug flag on/off for testing
#flag.debug <- TRUE
flag.debug <- FALSE
if(flag.debug) { print('Debug is on') }
# requireNamespace(methods) # TODO untested # TODO untested
## Create output directory if it doesn't already exist
if(!dir.exists(output.folder)) { dir.create(output.folder) }
## Load the label data
if(flag.debug) { print('Loading labels') }
labs <- load.label.data(fn.labs,classcol.labs)
# Get the the two labels
unique.labels <- get.unique.labels(labs[,classcol.labs],ignore.label)
## Sort the labels in k subsets
# TODO: check to make sure that each sample has data in at least 1 view before assigning a fold
if(is.numeric(cv.folds)){
print(paste('Number of cv folds provided:',cv.folds))
fold.vec <- c(rep(seq(cv.folds), each=floor(dim(labs)[[1]]/cv.folds)),sample(seq(cv.folds), dim(labs)[[1]]-cv.folds*(floor(dim(labs)[[1]]/cv.folds)), replace=FALSE))
fold.vec <- sample(fold.vec)
labs$fold <- fold.vec
} else if(is.vector(cv.folds) & length(cv.folds)==nrow(labs)) {
print('Using provided cv fold assignments')
labs$fold <- cv.folds
} else {
print('ERROR: cv.folds must be a vector or integer')
quit(save='no',status=1)
}
###################
# function to do a cv run for one fold
###################
do.one.cvfold <- function(k) {
print( paste('Processing fold:',k) )
## Define reduced label file
# mark labels of the hold out data set with 'testing', so that they can also be excluded from the unlabelled data in the platypus.R
labs.reduced <- labs
labs.reduced[which(labs$fold == k),classcol.labs] <- 'testing'
fn.labels.reduced <- tempfile()
utils::write.table(labs.reduced, file= fn.labels.reduced, sep="\t",row.names=T, col.names=T, quote=FALSE)
# print(apply(labs.reduced,2,table))
# print(dim(labs))
# print(dim(labs.reduced))
if(flag.debug) { print(table(labs.reduced)) }
# get platypus result list
platypus.result <- platypus(view.list=view.list, fn.labs=fn.labels.reduced, i=n.iters, m=majority.threshold.percent,e=expanded.output,u=updating,b=ignore.label,nfolds=nfolds)
#platypus.result <- platypus(view.list=view.list, fn.labs=fn.labels.reduced, i=n.iters, m=majority.threshold.percent,expanded.output=expanded.output,updating=updating,ignore.label=ignore.label)
# TODO: THIS NEXT CODE BLOCK IS THE BROKEN BIT FOR SSC EXAMPLE
# Error in na.fail.default(list(labels = c(`11031.p1` = 1L, `11033.p1` = 2L, :
# missing values in object
## Predict hold-out data subset with platypus result
test.ids <- rownames(labs[which(labs$fold == k & labs[,classcol.labs] != ignore.label),,drop=F])
predictions <- platypus.predict(platypus.result$final.views, platypus.result$weighting.threshold, test.ids,unique.labels,labs[,classcol.labs,drop=FALSE])
if(flag.debug) { message('Held out data subset predictions made successfully') }
if(flag.debug) { message(utils::head(predictions)) } # TODO: Prints in 1 line, is ugly.
## Calculate final platypus performance for each cv-fold
perf.all.agree <- calculate.performance(predictions[,c("final","category.all")],labs[,classcol.labs,drop=F],unique.labels)
if(flag.debug) { print('Calculated all.agree') }
perf.majority.agree <- calculate.performance(predictions[,c("final","category.majority")],labs[,classcol.labs,drop=F],unique.labels)
if(flag.debug) { print('Calculated majority.agree') }
## Calculate final performance values for each view
perf.views <- list()
accuracy.views <- c()
balanced.accuracy.views <- c()
for(view.i in seq(length(platypus.result$final.views))){
perf.view <- calculate.performance.view(predictions[,view.i,drop=F],labs[,classcol.labs,drop=F],unique.labels)
perf.views[[view.i]] <- perf.view
accuracy.views <- c(accuracy.views,perf.view$accuracy)
balanced.accuracy.views <- c(balanced.accuracy.views,perf.view$balanced.accuracy)
}
accuracy.cvfolds <- c(k,perf.all.agree$accuracy,perf.all.agree$balanced.accuracy,perf.all.agree$coverage
,perf.majority.agree$accuracy,perf.majority.agree$balanced.accuracy,perf.majority.agree$coverage
,accuracy.views, balanced.accuracy.views)
performance.cvfolds <- list(perf.all.agree=perf.all.agree,perf.majority.agree=perf.majority.agree,perf.views=perf.views)
## Calculate performance in each iteration, if expanded output flag is on
if(expanded.output){
iteration.information <- platypus.result$iteration.information
performance.iterations <- list()
accuracy.platypus.iterations <- c()
for(i in seq(length(iteration.information))){
## Predict hold-out data subset with platypus view from iteration
view.list <- iteration.information[[i]]$view.list
test.ids <- rownames(labs[which(labs$fold == k & labs[,classcol.labs] != ignore.label),,drop=F])
predictions <- platypus.predict(view.list, platypus.result$weighting.threshold, test.ids, unique.labels,labs[,classcol.labs,drop=FALSE])
## Calculate performance of the iteration platypus and views
if(flag.debug) { print('Caluculating iteration performances') }
perf.all.agree <- calculate.performance(predictions[,c("final","category.all")],labs[,classcol.labs,drop=F],unique.labels)
perf.majority.agree <- calculate.performance(predictions[,c("final","category.majority")],labs[,classcol.labs,drop=F],unique.labels)
perf.views <- list()
accuracy.views <- c()
balanced.accuracy.views <- c()
weighting.views <- c()
weighting.norm.views <- c()
for(view.i in seq(length(view.list))){
perf.view <- calculate.performance.view(predictions[,view.i,drop=F],labs[,classcol.labs,drop=F],unique.labels)
perf.views[[view.i]] <- perf.view
accuracy.views <- c(accuracy.views,perf.view$accuracy)
balanced.accuracy.views <- c(balanced.accuracy.views,perf.view$balanced.accuracy)
weighting.views <- c(weighting.views,view.list[[view.i]]$acc)
weighting.norm.views <- c(weighting.norm.views,view.list[[view.i]]$acc.norm)
}
accuracy.platypus.iterations <- rbind(accuracy.platypus.iterations
,c(k,i
,iteration.information[[i]]$weighting.threshold.upper,iteration.information[[i]]$weighting.threshold.lower,iteration.information[[i]]$weighting.threshold
,weighting.views,weighting.norm.views
,iteration.information[[i]]$no.ids.labelled,iteration.information[[i]]$no.ids.unlabelled
,iteration.information[[i]]$no.new.labelled, iteration.information[[i]]$no.ids.left.unlabelled
,perf.all.agree$accuracy,perf.all.agree$balanced.accuracy,perf.all.agree$coverage
,perf.majority.agree$accuracy,perf.majority.agree$balanced.accuracy,perf.majority.agree$coverage
,accuracy.views,balanced.accuracy.views
)
)
performance.iterations[[i]] <- list(perf.all.agree=perf.all.agree,perf.majority.agree=perf.majority.agree,perf.views=perf.views)
}
} # end if expanded.output
return.list <- list(accuracy.cvfolds=accuracy.cvfolds,performance.cvfolds=performance.cvfolds)
if(expanded.output){
return.list <- list(accuracy.cvfolds=accuracy.cvfolds,performance.cvfolds=performance.cvfolds,accuracy.platypus.iterations=accuracy.platypus.iterations,performance.iterations=performance.iterations
,labelling.matrix=platypus.result$labelling.matrix,labelling.matrices.views=platypus.result$labelling.matrices.views)
}
if(flag.debug) { print('leaving do.one.cvfold') }
return(return.list)
} # end do.one.cvfold fxn
if(parallel) { cv.result.list <- parallel::mclapply(seq(cv.folds), do.one.cvfold) }
else { cv.result.list <- lapply(seq(cv.folds), do.one.cvfold) }
if(flag.debug) { print('out of cv folds loop') }
## Collect accuracy over cv-iterations
accuracy.cvfolds <- c()
performance.cvfolds <- list()
for(k in seq(cv.folds)){
accuracy.cvfolds <- rbind(accuracy.cvfolds,cv.result.list[[k]]$accuracy.cvfolds)
performance.cvfolds[[k]] <- cv.result.list[[k]]$performance.cvfolds
}
colnames(accuracy.cvfolds) <- c("cv.fold","accuracy.all.agree","balanced.accuracy.all.agree","coverage.all.agree","accuracy.majority.agree","balanced.accuracy.majority.agree","coverage.majority.agree"
,paste0("accuracy.view.",seq(length(view.list))),paste0("balanced.accuracy.view.",seq(length(view.list))))
if(flag.debug) { print('collected cv iteration performances') }
if(!is.na(output.folder)) {
utils::write.table(accuracy.cvfolds,file=file.path(output.folder,"perf_platypus.tab"), sep="\t",row.names=F, col.names=T, quote=FALSE)
save(performance.cvfolds,file=file.path(output.folder,"performance.cvfolds.Rdata") )
}
if(flag.debug) { print('Stored results to file') }
if(expanded.output){
if(flag.debug) { print('Entering expanded.output loop') }
## Collect accuracy over platypus-iterations
accuracy.platypus.iterations <- c()
performance.iterations <- list()
for(k in seq(cv.folds)){
accuracy.platypus.iterations <- rbind(accuracy.platypus.iterations,cv.result.list[[k]]$accuracy.platypus.iterations)
performance.iterations[[k]] <- cv.result.list[[k]]$performance.iterations
}
colnames(accuracy.platypus.iterations) <- c("cv.fold","iteration","weighting.threshold.upper","weighting.threshold.lower","weighting.threshold"
,paste0("weighting.view.",seq(length(view.list))),paste0("weighting.norm.view.",seq(length(view.list)))
,"no.ids.labelled","no.ids.unlabelled"
,"no.new.labelled", "no.ids.left.unlabelled"
,"accuracy.all.agree","balanced.accuracy.all.agree","coverage.all.agree"
,"accuracy.majority.agree","balanced.accuracy.majority.agree","coverage.majority.agree"
,paste0("accuracy.view.",seq(length(view.list))),paste0("balanced.accuracy.view.",seq(length(view.list))))
if(flag.debug) { print('Renamed the results') }
if(!is.na(output.folder)) {
utils::write.table(accuracy.platypus.iterations, file= file.path(output.folder,"perf_platypus_expanded.tab"), sep="\t",row.names=F, col.names=T, quote=FALSE)
save(performance.iterations,file =file.path(output.folder,"performance.iterations.Rdata") )
}
if(flag.debug) { print('Renamed the results') }
labelling.matrix.cvlist <- list()
labelling.matrices.views.cvlist <- list()
for(k in seq(cv.folds)){
labelling.matrix.cvlist[[k]] <- cv.result.list[[k]]$labelling.matrix
labelling.matrices.views.cvlist[[k]] <- cv.result.list[[k]]$labelling.matrices.views
}
if(flag.debug) { print('Stored the labelling matrix') }
if(!is.na(output.folder)) {
save(labelling.matrix.cvlist,file =file.path(output.folder,"labelling.matrix.cvlist.Rdata") )
save(labelling.matrices.views.cvlist,file =file.path(output.folder,"labelling.matrices.views.cvlist.Rdata") )
}
if(flag.debug) { print('Wrote to files the labelling matrix') }
} # end if(expanded.output)
if(expanded.output) return( list(fold.accuracy=accuracy.cvfolds, labelling.matrix=labelling.matrix.cvlist, labelling.matrices.views=labelling.matrices.views.cvlist) )
else return( list(fold.accuracy=accuracy.cvfolds, labelling.matrix=NA, labelling.matrices.views=NA ) )
} # end cv.platypus
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.