##################################################################################################
### Given expanded output from CV tests, generate plots showing AUC change & labels learned
######################################################################################
## Arguments:
## fn.labs = filename of the outcome labels
## folder = output directory name
## plot.single.cvfolds = whether or not to plot lines for each cv fold, in addition to the genearl plots
################################################################
## TODO: Shouldn't I have this take in the RData object, isntead of loading a file? That way the example can run cv.platypus then call this using the results
#' Given expanded output from CV tests, generate plots showing AUC change & labels learned.
#'
#' @param fn.labs File containing outcome labels
#' @param folder Output directory where results will be written and cv.platypus results already are
#' @param plot.single.cvfolds Whether or not to plot lines for each cv fold, in addition to the general line plot
#'
#' @return Nothing. Plots are printed to file in user-provided folder
#'
#' @export
plot.cv <- function(fn.labs, folder, plot.single.cvfolds=FALSE) {
## Check arguments exist
if(!file.exists(fn.labs)) { stop(paste("ERROR: Labels file does not exist:",fn.labs)) }
if(!file.exists(folder)) { stop(paste("ERROR: Folder does not exist:",folder)) }
## Don't plot each cv fold
#plot.single.cvfolds <- FALSE
## Set up some information & fn.cv.resultss
fn.cv.results <- file.path(folder,'perf_platypus_expanded.tab')
## Gracefully quit if the requisite output file is missing
if(!file.exists(fn.cv.results)){ stop('ERROR: missing PLATYPUS CV performance file') }
accuracy.platypus.iterations <- read.table(fn.cv.results, sep='\t',header=T)
## Load the labels, remove unlabeled samples
labs <- read.table(fn.labs, sep='\t',header=T, row.names=1, check.names=F, stringsAsFactors = FALSE)
labs <- labs[which(!(is.na(labs[,1]))),,drop=F]
compound <- colnames(labs)[1] # Grab the name of the prediction task from the column name
no.views <- length(grep('balanced.accuracy.view.', colnames(accuracy.platypus.iterations))) # Have this value for all views, so pull num views from that
## Set output plot name
pdf.name <- paste('cv_performance',compound, no.views, 'views.pdf', sep='_')
pdf.name <- file.path(folder,pdf.name)
cv.folds <- max(accuracy.platypus.iterations$cv.fold)
## calculate no. correct predictions from accuracy and coverage for all.agree and majority.agree
no.correctly.predicted.balanced.all <- accuracy.platypus.iterations$coverage.all.agree * accuracy.platypus.iterations$balanced.accuracy.all.agree
accuracy.platypus.iterations <- cbind(accuracy.platypus.iterations,no.correctly.predicted.balanced.all)
no.correctly.predicted.balanced.majority <- accuracy.platypus.iterations$coverage.majority.agree * accuracy.platypus.iterations$balanced.accuracy.majority.agree
accuracy.platypus.iterations <- cbind(accuracy.platypus.iterations,no.correctly.predicted.balanced.majority)
## extend accuracy data table to maximal number of iterations
no.iterations <- max(accuracy.platypus.iterations$iteration)
data.extended <- c()
for(cv in 1:cv.folds){
iterations.in.fold <- max(accuracy.platypus.iterations[accuracy.platypus.iterations$cv.fold==cv,"iteration"])
accuracy.platypus.iterations.cvfold <- accuracy.platypus.iterations[which(accuracy.platypus.iterations$cv.fold == cv),]
# fill in the empty iterations with the result from the last iteration
if(iterations.in.fold < no.iterations){
for(i in (iterations.in.fold + 1):no.iterations){
accuracy.platypus.iterations.cvfold <- rbind(accuracy.platypus.iterations.cvfold,accuracy.platypus.iterations.cvfold[dim(accuracy.platypus.iterations.cvfold)[[1]],])
} # end for no.iterations
} # end if
data.extended <- rbind(data.extended,accuracy.platypus.iterations.cvfold)
} # end for cv.folds
# accuracy of MVL
platypus.acc.avg.all <- c()
platypus.acc.avg.majority <- c()
platypus.acc.sd.all <- c()
platypus.acc.sd.majority <- c()
for(i in 1:no.iterations){
acc.vec.all <- c()
acc.vec.majority <- c()
for(cv in 1:cv.folds){
acc.vec.all <- c(acc.vec.all,data.extended[which(data.extended$cv.fold==cv),"balanced.accuracy.all.agree"][i])
acc.vec.majority <- c(acc.vec.majority,data.extended[which(data.extended$cv.fold==cv),"balanced.accuracy.majority.agree"][i])
} # end for cv.folds
platypus.acc.avg.all <- c(platypus.acc.avg.all, mean(acc.vec.all))
platypus.acc.sd.all <- c(platypus.acc.sd.all, sd(acc.vec.all))
platypus.acc.avg.majority <- c(platypus.acc.avg.majority, mean(acc.vec.majority))
platypus.acc.sd.majority <- c(platypus.acc.sd.majority, sd(acc.vec.majority))
} # end for no.iterations
## accuracy of views
views.acc.avg <- c()
views.acc.sd <- c()
for(view.i in 1:no.views){
#for(view.i in 1:length(grep("balanced.accuracy.view.", colnames(data.extended)))){ #TODO
view.acc.avg <- c()
view.acc.sd <- c()
for(i in 1:no.iterations){
acc.vec <- c()
for(cv in 1:cv.folds){
acc.vec <- c(acc.vec,data.extended[which(data.extended$cv.fold==cv),paste0("balanced.accuracy.view.",view.i)][i])
} # end for cv.folds
view.acc.avg <- c(view.acc.avg, mean(acc.vec))
view.acc.sd <- c(view.acc.sd, sd(acc.vec))
} # end for no.iterations
views.acc.avg <- cbind(views.acc.avg,view.acc.avg)
views.acc.sd <- cbind(views.acc.sd,view.acc.sd)
} # end for view.
## prediction coverage
platypus.cov.avg.all <- c()
platypus.cov.avg.majority <- c()
platypus.cov.sd.all <- c()
platypus.cov.sd.majority <- c()
for(i in 1:no.iterations){
cov.vec.all <- c()
cov.vec.majority <- c()
for(cv in 1:cv.folds){
cov.vec.all <- c(cov.vec.all,data.extended[which(data.extended$cv.fold==cv),"coverage.all.agree"][i])
cov.vec.majority <- c(cov.vec.majority,data.extended[which(data.extended$cv.fold==cv),"coverage.majority.agree"][i])
} # end for cv.folds
platypus.cov.avg.all <- c(platypus.cov.avg.all, mean(cov.vec.all))
platypus.cov.sd.all <- c(platypus.cov.sd.all, sd(cov.vec.all))
platypus.cov.avg.majority <- c(platypus.cov.avg.majority, mean(cov.vec.majority))
platypus.cov.sd.majority <- c(platypus.cov.sd.majority, sd(cov.vec.majority))
} # end for no.iterations
## inclusion of unlabelled data in training
platypus.inclusion.avg <- c()
platypus.inclusion.sd <- c()
for(i in 1:no.iterations){
inclusion.vec <- c()
for(cv in 1:cv.folds){
inclusion.vec <- c(inclusion.vec,data.extended[which(data.extended$cv.fold==cv),"no.ids.labelled"][i])
} # end for cv.folds
platypus.inclusion.avg <- c(platypus.inclusion.avg, mean(inclusion.vec))
platypus.inclusion.sd <- c(platypus.inclusion.sd, sd(inclusion.vec))
} # end for no.iterations
# thresholds of MVL labelling
ll.upper.avg <- c()
ll.upper.sd <- c()
for(i in 1:no.iterations){
upper.vec <- c()
for(cv in 1:cv.folds){
upper.vec <- c(upper.vec,data.extended[which(data.extended$cv.fold==cv),"weighting.threshold.upper"][i])
} # end for cv.folds
ll.upper.avg <- c(ll.upper.avg, mean(upper.vec))
ll.upper.sd <- c(ll.upper.sd, sd(upper.vec))
} # end for no.iterations
ll.lower.avg <- c()
ll.lower.sd <- c()
for(i in 1:no.iterations){
lower.vec <- c()
for(cv in 1:cv.folds){
lower.vec <- c(lower.vec,data.extended[which(data.extended$cv.fold==cv),"weighting.threshold.lower"][i])
#lower.vec <- c(lower.vec,data.extended[which(data.extended[,"cv.fold"] == cv),"weighting.threshold.lower"][i])
} # end for cv.folds
ll.lower.avg <- c(ll.lower.avg, mean(lower.vec))
ll.lower.sd <- c(ll.lower.sd, sd(lower.vec))
} # end for no.iterations
#############
# PLOTTING
#############
## Create the pdf where all plots are saved
pdf(file=pdf.name,width=8,height=7)
green.transparent <- rgb(col2rgb("green")[1,1],col2rgb("green")[2,1],col2rgb("green")[3,1],100,maxColorValue=255)
par(mar=c(4, 4, 0, 0) + 0.1,par(xaxs='i',yaxs='i'))
layout(as.matrix(rbind(1,2,3)), widths=c(1,1,1), heights=c(1,1,1.25))
par(mar=c(1, 4, 0.6, 0.1) + 0.1)
## prediction accuracy plot
plot(1:no.iterations,platypus.acc.avg.all,ylim=c(0.5,1),xlim=c(1,no.iterations),type='l',lwd=2,col="red3",xlab="",ylab="CV Accuracy",xaxt='n')
grid()
if(plot.single.cvfolds){
for(cv in 1:cv.folds){
lines(1:no.iterations,data.extended[which(data.extended$cv.fold==cv),"balanced.accuracy.all.agree"],type='l',col="tomato")
} # end for cv.folds
} # end if
upper <- platypus.acc.avg.majority + platypus.acc.sd.majority
lower <- platypus.acc.avg.majority - platypus.acc.sd.majority
lightblue.transparent <- rgb(col2rgb("lightblue")[1,1],col2rgb("lightblue")[2,1],col2rgb("lightblue")[3,1],100,maxColorValue=255)
polygon(c(1:no.iterations, rev(1:no.iterations)), c(upper, rev(lower)), col = lightblue.transparent, border = NA)
if(plot.single.cvfolds){
for(cv in 1:cv.folds){
lines(1:no.iterations,data.extended[which(data.extended$cv.fold==cv),"balanced.accuracy.majority.agree"],type='l',col="lightblue")
} # end for
} # end if
#red polygon again
upper <- platypus.acc.avg.all + platypus.acc.sd.all
lower <- platypus.acc.avg.all - platypus.acc.sd.all
tomato.transparent <- rgb(col2rgb("tomato")[1,1],col2rgb("tomato")[2,1],col2rgb("tomato")[3,1],100,maxColorValue=255)
polygon(c(1:no.iterations, rev(1:no.iterations)), c(upper, rev(lower)), col = tomato.transparent, border = NA)
#single views
for(view.i in 1:dim(views.acc.avg)[[2]]){
lines(1:no.iterations,views.acc.avg[,view.i],pch=20,col=green.transparent,lwd=2)
} # end for
# lines again
lines(1:no.iterations,platypus.acc.avg.majority,col="blue",lwd=2)
lines(1:no.iterations, platypus.acc.avg.all, col = 'red3',lwd=2)
legend("bottomright",legend=c("all agree","majority agrees","single views"), lty=c(1,1,1),lwd=c(2,2,2), col=c("red3","blue","green"), bty='n')
## inclusion of unlabelled data plot
par(mar=c(1, 4, 0.6, 0.1) + 0.1)
plot(1:no.iterations,platypus.inclusion.avg,ylim=c(0,max(platypus.inclusion.avg)),type='l',lwd=2,col="black",xlab="Iterations",ylab="# Labelled Samples",xaxt='n')
grid()
upper <- platypus.inclusion.avg + platypus.inclusion.sd
lower <- platypus.inclusion.avg - platypus.inclusion.sd
polygon(c(1:no.iterations, rev(1:no.iterations)), c(upper, rev(lower)), col = "grey", border = NA)
lines(1:no.iterations,platypus.inclusion.avg,col="black",lwd=2)
if(plot.single.cvfolds){
for(cv in 1:cv.folds){
lines(1:no.iterations,data.extended[which(data.extended$cv.fold==cv),"no.ids.labelled"],type='l',col="grey")
lines(1:no.iterations,platypus.inclusion.avg,ylim=c(0,max(platypus.inclusion.avg)),type='l',lwd=2,col="black") # Re-plot black line so it's on top
} # end for
} # end if
### Threshold plot
par(mar=c(4, 4, 0.6, 0.1) + 0.1)
magenta.transparent <- rgb(col2rgb("magenta")[1,1],col2rgb("magenta")[2,1],col2rgb("magenta")[3,1],100,maxColorValue=255)
plot(1:no.iterations,ll.upper.avg,ylim=c(0,max(ll.upper.avg+ll.upper.sd)),type='l',lwd=2,col="magenta3",xlab="Iteration",ylab="Voting Sum")
grid()
upper <- ll.upper.avg + ll.upper.sd
lower <- ll.upper.avg - ll.upper.sd
polygon(c(1:no.iterations, rev(1:no.iterations)), c(upper, rev(lower)), col = magenta.transparent, border = NA)
lines(1:no.iterations,ll.upper.avg,col="red3",lwd=2)
if(plot.single.cvfolds){
for(cv in 1:cv.folds){
lines(1:no.iterations,data.extended[which(data.extended$cv.fold==cv),"weighting.threshold.upper"],type='l',col="tomato")
} # end for
lines(1:no.iterations, ll.upper.avg, col = 'red3',lwd=2)
} # end if
#lower
upper <- ll.lower.avg + ll.lower.sd
lower <- ll.lower.avg - ll.lower.sd
lightblue.transparent <- rgb(col2rgb("lightblue")[1,1],col2rgb("lightblue")[2,1],col2rgb("lightblue")[3,1],100,maxColorValue=255)
polygon(c(1:no.iterations, rev(1:no.iterations)), c(upper, rev(lower)), col = green.transparent, border = NA)
if(plot.single.cvfolds){
for(cv in 1:cv.folds){
lines(1:no.iterations,data.extended[which(data.extended$cv.fold==cv),"weighting.threshold.lower"],type='l',col="lightblue")
} # end for
} # end if
lines(1:no.iterations,ll.lower.avg,type='l',lwd=2,col="green3")
legend("left",legend=c("min. voting for favored label","max. voting for contrary label"), lty=c(1,1),lwd=c(2,2), col=c("magenta3","green3"), bty='n')
dev.off()
print(paste('Finished! Success. Plot saved as:', pdf.name))
} # end plot.cv function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.