R/platypus.basicFunctions.R

## Basic Functions for the MVL framework

## Subset to k most variable features
#drop.features <- function(dat, k) {
#  if(ncol(dat) > k ) {
#    vars <- apply(dat, 2, stats::var)
#    vars <- sort(vars, decreasing=TRUE)
#    thresh <- vars[k]
#    dat <- dat[,names(which(vars>=thresh))]
#    write( paste('Shrinking to', k, 'most variable features...\n', ncol(dat)), stdout() )
#    if(ncol(dat) > k) {
#      write( paste('NOTE: Many variables have the same variance, so', ncol(dat), 'variables were kept\n'), stdout() )
#    }
#  } else {
#    write(paste('WARNING: Data matrix already has', k, 'features\n'),stdout() )
#  }
#  return(dat)
#}

## TODO Make a platypus object class, as well as cv.platypus and llv.platypus
# fn.labs, view.list, ignore.label='intermediate', i=100, m=100, u=FALSE, e=FALSE,updating=FALSE,expanded.output=FALSE
#platypus <- function(labs, views, iters=10) {
#  me <- list(
#    iters = iters,
#    labs = labs,
#    views = views
#  )
#  ## Set the name for the class
#  class(me) <- append(class(me),"platypus")
#  return(me)
#}
#  structure(list(views=views,labs=labs,iters=iters), class = "foo")
#}
#platypus('moo', list('oink','baa'))


## create a class for each view-type
ElasticNet <- function(param.file="",data.matrix=c(),data.fn="",alpha=0.9,measure="auc",drop=TRUE,drop.to=5000,model=c(),acc=0.5, acc.norm=0, family='binomial', view.name=NA) {
  me <- list(
    param.file = param.file,
    data.matrix = data.matrix,
    data.fn = data.fn,
    alpha = alpha,
    measure = measure,
    family = family,
    drop = drop,
    drop.to = drop.to,
    model = model,
    acc = acc,
    acc.norm = acc.norm,
    view.name = view.name
  )
  
  ## Set the name for the class
  class(me) <- append(class(me),"ElasticNet")
  return(me)
}

RandomForest <- function(param.file="",data.matrix=c(),data.fn="",mtry="sqrt",ntree=500,drop=TRUE,drop.to=5000,model=c(),acc=0.5,acc.norm=0, view.name=NA) {
  me <- list(
    param.file = param.file,
    data.matrix = data.matrix,
    data.fn = data.fn,   
    mtry = mtry,  
    ntree = ntree,
    drop = drop,
    drop.to = drop.to,
    model = model,
    acc = acc,
    acc.norm = acc.norm,
    view.name = view.name
  )
  
  ## Set the name for the class
  class(me) <- append(class(me),"RandomForest")
  return(me)
}

## TODO: unfinished, untested
SupportVectorMachine <- function(param.file="",data.matrix=c(),data.fn="",model=c(),acc=0.5,accSD=0,C=1, view.name=NA) {
  me <- list(
    param.file = param.file,
    data.matrix = data.matrix,
    data.fn = data.fn,
    C = C,
    model = model,
    acc = acc,
    accSD = accSD,
    view.name = view.name
  )

  ## Set the name for the class
  class(me) <- append(class(me),"SupportVectorMachine")
  return(me)
}
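
## A minimal sketch of constructing view objects directly (hypothetical file and object
## names); in normal use load.parameterfile() below builds these from a config file.
# expr.mat <- data.matrix( utils::read.table('expression.tsv', sep='\t', header=TRUE, row.names=1, check.names=FALSE) )
# en.view  <- ElasticNet(data.fn='expression.tsv', data.matrix=expr.mat, alpha=0.9, measure='auc', view.name='expression.en')
# rf.view  <- RandomForest(data.fn='expression.tsv', data.matrix=expr.mat, ntree=500, view.name='expression.rf')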

## Read in the parameter file for a view and return a view object
## TODO: Add in option for task name
#' Create a view from a configuration file
#'
#' @param filename File containing config information
#' @param delim Optional delimiter for the file containing view data (not the config file!)
#' @return View object
#' @keywords platypus
#' @export
load.parameterfile <- function(filename, delim='\t'){
 
  # Load parameters file 
  param.table <- utils::read.table(filename, sep='\t',header=FALSE, row.names=1, stringsAsFactors=FALSE)
  
  # Check the type and create a view object, set type-specific parameters
  type <- param.table["type",1]
  if(type == "en"){
    view.object <- ElasticNet(param.file=filename,data.fn=toString(param.table["data.fn",1]))
    if("alpha" %in% rownames(param.table)){ view.object$alpha <- as.numeric(as.character(param.table["alpha",1])) }
    if("measure" %in% rownames(param.table)){ view.object$measure <- as.character(param.table["measure",1]) }
  } else if(type == 'rf'){
    view.object <- RandomForest(param.file=filename,data.fn=toString(param.table["data.fn",1]))
    if("mtry" %in% rownames(param.table)){ view.object$mtry <- as.numeric(as.character(param.table["mtry",1])) }
    if("ntree" %in% rownames(param.table)){ view.object$ntree <- as.numeric(as.character(param.table["ntree",1])) }
  } else if(type == 'svm'){
    message('SVMs are not tested yet, be warned.')
    view.object <- SupportVectorMachine(param.file=filename, data.fn=toString(param.table["data.fn",1]))
    if("C" %in% rownames(param.table)) { view.object$C <- as.numeric(as.character(param.table["ntree",1])) }
  } else { 
    stop(paste("Could not find valid type specification in parameter file:",filename,"; Valid types are: rf, en, svm."))
  }

  # Parameters common to all view types
  if("view.name" %in% rownames(param.table)){ view.object$view.name <- param.table["view.name",1] }
  if("acc" %in% rownames(param.table)){ view.object$acc <- as.numeric(as.character(param.table["acc",1])) }
  if("data.fn" %in% rownames(param.table)){ view.object$data.matrix <- data.matrix( utils::read.table(view.object$data.fn, sep=delim, header=TRUE, row.names=1, check.names=FALSE) ) } # TODO: Do we really want to force everything to be numeric???
#  if("drop" %in% rownames(param.table)){ view.object$drop <- as.logical(param.table["drop",1]) } # TODO: Remove this option
#  if("drop.to" %in% rownames(param.table)){ view.object$drop.to <- as.numeric(as.character(param.table["drop.to",1])) } # TODO: Remove this option

  return(view.object)
}
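
## A sketch of a tab-delimited parameter file for an elastic net view (hypothetical paths);
## each row is a "key<TAB>value" pair, with keys matching the row names read above:
#   type        en
#   data.fn     /path/to/expression_matrix.tsv
#   view.name   expression
#   alpha       0.9
#   measure     auc
#   acc         0.5
## expression.view <- load.parameterfile('expression_view.params')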


## load the feature matrix from file and attach it to the view object
load.data <- function(view.object, delim='\t'){

  # read the matrix from the file; 'mat' remains visible to the dispatched method below
  mat <- data.matrix( utils::read.table(view.object$data.fn, sep=delim, header=TRUE, row.names=1, check.names=FALSE) )

  UseMethod("load.data", view.object)
}
load.data.ElasticNet <- function(view.object, delim='\t'){
  # set the matrix
  view.object$data.matrix <- mat
  print(paste('data loaded for Elastic Net view', view.object$data.fn))
  return(view.object)
}
load.data.RandomForest <- function(view.object, delim='\t'){
  # set the matrix
  view.object$data.matrix <- mat
  print(paste('data loaded for Random Forest view', view.object$data.fn))
  # set mtry parameter to sqrt(#features) if left at the default
  if(view.object$mtry == "sqrt"){ view.object$mtry <- floor(sqrt(ncol(mat))) }
  return(view.object)
}
load.data.SupportVectorMachine <- function(view.object, delim='\t'){
  # set the matrix
  view.object$data.matrix <- mat
  print(paste('data loaded for SVM view', view.object$data.fn))
  return(view.object)
}

## load the label file
## TODO: I updated this to return the full labels matrix, but each fxn will now need to iterate over each column
load.label.data.old <- function(fn.labs,delim='\t'){
  # read file
  labs <- utils::read.table(fn.labs, sep=delim,header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors = FALSE) 
  # remove samples that have an 'NA' value in any label column
  labs <- labs[!apply(labs,1,function(x){any(is.na(x))}),,drop=FALSE]
  return(labs)
}
load.label.data <- function(fn.labs,classcol.labs, delim='\t'){
  # read file
  labs <- utils::read.table(fn.labs, sep=delim,header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors = FALSE) 
  # take out 'NA' values
  labs <- labs[which(!(is.na(labs[,classcol.labs]))),,drop=FALSE]
  return(labs)
}

## retrieve the two labels for training/predicting
get.unique.labels <- function(label.vec,ignore.label){

  ## Get the the two labels
  unique.labels <- unique(label.vec)
  
  # exclude a label if given by ignore.label, e.g. 'intermediate'
  if(ignore.label %in% unique.labels){ unique.labels <- unique.labels[-c(which(unique.labels == ignore.label))] }

  # exclude label 'testing' - this is introduced by cv.platypus() to mark the test set
  if('testing' %in% unique.labels){ unique.labels <- unique.labels[-c(which(unique.labels == 'testing'))] }

  unique.labels <- as.vector(sort(unique.labels))
  
  # MVL classification for 2 labels
  if(length(unique.labels) != 2){
    print(paste0("WARNING: ",length(unique.labels)," labels for a binary task. Results might not be correct!"))
  }
  return(unique.labels)
}
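
## Example: with an 'intermediate' class to ignore and the 'testing' marker added by
## cv.platypus(), only the two outcome labels are kept and returned sorted:
# get.unique.labels(c('sensitive','resistant','intermediate','testing','sensitive'), ignore.label='intermediate')
# returns c("resistant", "sensitive")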

view.train <- function( labels, view.object, nfolds=10 ) {
  # Intersect IDs in labels and in the feature data
  ids <- intersect(rownames(labels),rownames(view.object$data.matrix))

  # Reduce number of folds if fewer than 10 samples per fold
  if(length(ids)/nfolds < 10) { nfolds <- floor(length(ids)/10); print(paste('WARNING: Using',nfolds,'folds because there are not enough samples')) }

  UseMethod("view.train",view.object)
}
view.train.ElasticNet <- function(labels, view.object, nfolds){
  view.object$model <- cv.glmnet( view.object$data.matrix[ids,]
                                  ,as.factor(labels[ids,1]), family=view.object$family
                                  ,type.measure=view.object$measure
                                  ,alpha=view.object$alpha 
                                  ,nfolds=nfolds
                                  ,keep=TRUE)
  return(view.object)
}
view.train.RandomForest <- function( labels, view.object, nfolds ){
  
  view.object$model <- randomForest( view.object$data.matrix[ids,], 
                                     as.factor(labels[ids,1]), 
                                     mtry=view.object$mtry, 
                                     ntree=view.object$ntree )
  return(view.object)
}
view.train.SupportVectorMachine <- function( labels, view.object, nfolds ){
  ## TODO: UNTESTED
# TODO: caret uses kernlab for svmRadialCost, so pick a different kernel. Oops ;)
#  view.object$model <- e1071::svm(labels[ids,1] ~ ., data=view.object$data.matrix[ids,], 
#                                       kernel=view.object$kernel)#, TODO: cost not implemented yet  
#                                       cost=view.object$cost)#,  TODO: gamma not implemented yet
#                                       gamma=view.object$gamma)
  view.object$model <- kernlab::ksvm(x = as.matrix(view.object$data.matrix[ids,]), y = labels[ids,1], kernel = "rbfdot", C = view.object$C) # TODO: copied over from caret for now. 
  return(view.object)
}



## Take a trained view and new labels, return predictions
view.predict <- function(ids.unlabelled, view.object) {
  # Intersect IDs in labels and in the feature data
  ids <- intersect(ids.unlabelled,rownames(view.object$data.matrix))
  UseMethod("view.predict",view.object)
}
view.predict.ElasticNet <- function(ids.unlabelled, view.object) {
  # TODO: update to use caret?
  if(length(ids) > 0){
    return( predict(view.object$model, view.object$data.matrix[ids,,drop=FALSE], type='class', s='lambda.min') )  # S3 dispatch to glmnet's predict method for cv.glmnet objects  # TODO: use lambda.min or the default s="lambda.1se"?
  }
  
  return(as.character(c()))
}
view.predict.RandomForest <- function(ids.unlabelled, view.object) {
  if(length(ids) > 0){
    return( as.character( predict(view.object$model, view.object$data.matrix[ids,,drop=FALSE], type='response')  ) )
    #return( as.character( randomForest::predict.randomForest(view.object$model, view.object$data.matrix[ids,,drop=FALSE], type='response')  ) )
  }
  
  return(as.character(c()))
}
view.predict.SupportVectorMachine <- function(ids.unlabelled, view.object) {
  if(length(ids) > 0){
    #return( as.character( e1071::predict.svm(view.object$model, view.object$data.matrix[ids,,drop=FALSE])  ) )
    return( as.character( kernlab::predict(view.object$model, view.object$data.matrix[ids,,drop=FALSE]) ) )
  }

  return(as.character(c()))
}
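
## A sketch of the per-view train/predict cycle used inside platypus (hypothetical
## objects): 'labs' is a one-column data.frame of class labels keyed by sample ID.
# en.view <- view.train(labs, en.view, nfolds=10)                       # fit cv.glmnet on the labelled samples
# new.ids <- setdiff(rownames(en.view$data.matrix), rownames(labs))     # samples without a known label
# preds   <- view.predict(new.ids, en.view)                             # per-sample class predictions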


### PLATYPUS has 2 options for label learning: either a stacked model or our homebrew optimization function (the ensemble approach)
## TODO: This function should also be something the user can use to make new predictions, given a set of views? If I do that, remove the unique.labels option???
#' Given a trained platypus model, make predictions. Generally used internally by platypus
#'
#' @param view.list List of views to use for predictions
#' @param majority Confidence threshold for predictions
#' @param test.ids Samples to be predicted
#' @param unique.labels Class labels
#' @param labels Labels for known samples
#' @param join.fxn Function used to make group votes
#'
#' @return Matrix of predictions (One per view, overall, etc)
#'
#' @export
platypus.predict <- function(view.list, majority, test.ids,unique.labels,labels,join.fxn='ensemble'){
  if(join.fxn=='ensemble') { return(platypus.predict.ensemble(view.list, majority, test.ids,unique.labels)) }
  else if(join.fxn=='stacked') { return(platypus.predict.stacked(view.list, majority, test.ids,unique.labels,labels)) }
  else { stop("ERROR: Can only predict using 'ensemble' or 'stacked' predict options.") } # Only can handle ensemble/stacked learning for now
}
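
## Example call (hypothetical objects): 'trained.views' is a list of trained view objects,
## 'labs' the known-label data.frame, and 'test.ids' the sample IDs to predict.
# preds <- platypus.predict(view.list=trained.views, majority=0.5, test.ids=test.ids,
#                           unique.labels=c('resistant','sensitive'), labels=labs, join.fxn='ensemble')
# preds[, c('final','category.all','category.majority')]   # per-sample ensemble calls and agreement categories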

## 20180709 - Kiley updates to add in stacked learning
## Take a platypus result and predict new labels with it
platypus.predict.stacked <- function(view.list, majority, test.ids,unique.labels,labels){

  ## Debug flag can be manually activated, for testing purposes 
  #flag.debug <- TRUE
  flag.debug <- FALSE 
  if(flag.debug) { print('Debug is on') }

  ## Return values:
  #  Test.ids is the list of sample IDs we are testing.
  #  predictions is the data frame of samples (rows) by views label predictions (eg. sensitive/non-sensitive)
  #  final is a list of outcome label predictions - NOTE: some of these have NA values! Names are sample IDs
  #  category.all is a list of agreed/not.agreed labels. Names are sample IDs
  #  category.majority is also a list of agreed/not.agreed labels. Names are sample IDs
  
  if(flag.debug) { print('platypus.predict') }
  #  predictions is the data frame of samples (rows) by views label predictions (eg. sensitive/non-sensitive)
  ## Make per-view predictions
  predictions <- matrix(data=NA, nrow=length(test.ids), ncol=length(view.list),dimnames=list(test.ids,sapply(view.list, function(x){x$view.name})))
  for(view.i in seq(length(view.list))){
    ids <- intersect(test.ids,rownames(view.list[[view.i]]$data.matrix))
    if(length(ids) > 0){
      predictions[ids,view.i] <- view.predict(ids,view.list[[view.i]])
    }
  } 
  predictions[is.na(predictions)] <- 'not.predicted' # TEMP removing NA values so model predictions work
  if(flag.debug) { print('Created predictions matrix') }

  # samples fall in 3 categories if view predictions are combined in an ensemble
  # agreed: there was enough agreement between the single views to make an ensemble prediction
  # not.agreed: not enough agreement between the single views to make an ensemble prediction
  # not.assessed: not enough data was available in the single views to ever reach an ensemble prediction under the given requirements

  # category.all is a list of agreed/not.agreed labels. Names are sample IDs
  category.all <- rep(NA,length(test.ids))#dim(predictions)[[1]])
  names(category.all) <- test.ids # rownames(predictions)
  category.all[rownames(predictions[which(apply(predictions,1,function(x) sum(table(x))>=length(view.list))),,drop=FALSE])] <- "not.agreed"
  category.all[rownames(predictions[which(apply(predictions,1,function(x) sum(table(x))<length(view.list))),,drop=FALSE])] <- "not.assessed"
  category.all[rownames(predictions[which(apply(predictions,1,function(x) table(x)[1]==length(view.list))),,drop=FALSE])] <- "agreed"
  category.majority <- category.all # TODO: TEMP. This used to be different and part of weighting, not sure going to use in stacked version 
  if(flag.debug) { print('Created category.all and category.majority') }

  ## Build the stacked model and get predictions for samples meeting the agreement threshold
  #for.model.predictions <- cbind(labels=labels[ids,1], predictions[ids,]) # Combine labels and predictions from each view to train the stacked model
  for.model.predictions <- data.frame(labels=labels[rownames(predictions),1], predictions, stringsAsFactors=TRUE) # Combine labels and per-view predictions (as factors) to train the stacked model
  #print(paste('Class for.model.predictions',class(for.model.predictions)))
  if(flag.debug) { print(unique(for.model.predictions[,1])) } # Print the unique labels 
  preds.model       <- randomForest( labels ~ ., for.model.predictions, ntree=100,replace=TRUE)
  if(flag.debug) { print('Make stacked model predictions') }
  preds.stckd       <- predict(preds.model, for.model.predictions[,-1,drop=FALSE], predict.all=TRUE) # S3 dispatch to randomForest's predict method; re-using the training columns keeps factor levels consistent
  #preds.stckd       <- randomForest::predict.randomForest(preds.model, predictions, predict.all=TRUE)
  if(flag.debug) { print('Make preds.stckd') }
  preds.stacked.all <- preds.stckd$individual
  if(flag.debug) { print('Make preds.stckd.indiv') }

  if(flag.debug) { print('Stacked model predictions made') }
  tbl.preds.stckd           <- matrix(NA, nrow=nrow(preds.stacked.all), ncol=2)
  rownames(tbl.preds.stckd) <- rownames(preds.stacked.all)
  colnames(tbl.preds.stckd) <- unique.labels
  tbl.preds.stckd[,1]       <-   apply(preds.stacked.all, 1, function(x) {sum(x==unique.labels[1])} ) 
  tbl.preds.stckd[,2]       <-   apply(preds.stacked.all, 1, function(x) {sum(x==unique.labels[2])} ) 
  tbl.preds.stckd           <- tbl.preds.stckd/ncol(preds.stacked.all)
 
  #category.majority <- predict(preds.model, predictions, predict.all=TRUE)$aggregate #preds.lbls #majority.res.list$category.majority #TODO: updated the typo, not tested yet 
  #category.majority <- preds.stckd$aggregate #preds.lbls #majority.res.list$category.majority #TODO: updated the typo, not tested yet 

  if(flag.debug) { print('Created tbl.preds.stckd') }
  #  final is a list of outcome label predictions - NOTE: some of these have NA values! Names are sample IDs
  final <- rep(NA, length(test.ids))
  names(final) <- test.ids
  final[rownames(preds.stacked.all)] <- as.character(preds.stckd$aggregate) # tbl.preds.stckd[apply(tbl.preds.stckd,1,max)>=majority,]
  if(flag.debug) { print('created final') }

  # Reset the NA values from before
  predictions[predictions=='not.predicted'] <- NA # Put these back to NA values
  if(flag.debug) { 
    print(unique(predictions))
    print('Re-Updated predictions matrix')
  }

  ## Debug only
  if(flag.debug) { 
    print('Dimensions of test.ids, predictions, final, category.all, category.majority:')
    print(length(test.ids))
    print(dim(predictions))
    print(length(final)) # TODO: Final in this version is a list, in stacked version is matrix. update!
    print(length(category.all))
    print(length(category.majority))

    print('TEST.IDS:')
    print(utils::head(test.ids))
    print('PREDICTIONS:')
    print(utils::head(predictions))
    print('FINAL:')
    print(utils::head(final))
    print('CATEGORY.ALL:')
    print(utils::head(category.all))
    print('CATEGORY.MAJORITY:')
    print(utils::head(category.majority))
  }

  predictions <- cbind(predictions,final,category.all,category.majority)

  if(flag.debug) { print('leaving platypus.predict') }
  return(predictions)

}

## Old platypus predict function - uses ensemble
platypus.predict.ensemble <- function(view.list, majority, test.ids,unique.labels){
  flag.debug <- FALSE

  predictions <- matrix(data=NA, nrow=length(test.ids), ncol=length(view.list),dimnames=list(test.ids,sapply(view.list, function(x){x$view.name})))
  for(view.i in seq(length(view.list))){
    ids <- intersect(test.ids,rownames(view.list[[view.i]]$data.matrix))
    if(length(ids) > 0){
      predictions[ids,view.i] <- view.predict(ids,view.list[[view.i]]) 
    }
  }

  # samples fall in 3 categories if view predictions are combined in an ensemble
  # agreed: there was enough agreement between the single views to make an ensemble prediction
  # not.agreed: not enough agreement between the single views to make an ensemble prediction
  # not.assessed: not enough data was available in the single views to ever reach an ensemble prediction under the given requirements

  category.all <- rep(NA,dim(predictions)[[1]])
  names(category.all) <- rownames(predictions)
  category.all[rownames(predictions[which(apply(predictions,1,function(x) sum(table(x))>=length(view.list))),,drop=FALSE])] <- "not.agreed"
  category.all[rownames(predictions[which(apply(predictions,1,function(x) sum(table(x))<length(view.list))),,drop=FALSE])] <- "not.assessed"
  category.all[rownames(predictions[which(apply(predictions,1,function(x) table(x)[1]==length(view.list))),,drop=FALSE])] <- "agreed"

  majority.res.list <- get.majority.weighting(view.list,majority,predictions,unique.labels)
  category.majority <- majority.res.list$category.majority
  final <- majority.res.list$final

  ## Debug only
  if(flag.debug) {
    print('Dimensions of test.ids, predictions, final, category.all, category.majority:')
    print(length(test.ids))
    print(dim(predictions))
    print(length(final)) # NOTE: Final in this version is a list, in stacked version is matrix. update!
    print(length(category.all))
    print(length(category.majority))
  
    print('TEST.IDS:')
    print(utils::head(test.ids))
    print('PREDICTIONS:')
    print(utils::head(predictions))
    print('FINAL:')
    print(utils::head(final))
    print('CATEGORY.ALL:')
    print(utils::head(category.all))
    print('CATEGORY.MAJORITY:')
    print(utils::head(category.majority))
  }

  predictions <- cbind(predictions,final,category.all,category.majority)
  return(predictions)

}

## Adjust the accuracy of each view in every platypus iteration to keep the vote weighting correct.
## The current accuracy of a view is calculated from its predictions on the originally known labels
## (via the elastic net cross-validation or the random forest out-of-bag predictions).
## Since views are re-trained on the known plus newly learned labels, this captures how accuracy changes as labels are added.
update.accuracies <- function(view.list,known.labels){
  for(view.i in seq(length(view.list))){
    view <- view.list[[view.i]]
    view <- update.accuracy(view,known.labels)
    view.list[[view.i]] <- view
    print( paste('Updated accuracy for view', view$view.name) )
  }
  
  return(view.list)
}
update.accuracy <- function(view, known.labels){
  UseMethod("update.accuracy",view)
}
## this method predicts on samples that were also used in training (to avoid a manual cross-validation),
## so the view's prediction accuracy is likely overestimated
## TODO: Make sure these update methods are correct
update.accuracy.ElasticNet <- function(view, known.labels){
  
  predictions <- view.predict(rownames(known.labels),view)
  label.tab <- merge(known.labels, as.data.frame(predictions), by.x="row.names", by.y="row.names", all=TRUE, sort=TRUE) 
  
  acc.list <- calculate.accuracy(label.tab,known.labels)
  view$acc <- acc.list$b.acc
  view$acc.norm <- normalize.accuracy.log(acc.list$b.acc)
  
  return(view)
}
## uses the out-of-bag (OOB) predictions of the random forest, giving an unbiased estimate of model accuracy
update.accuracy.RandomForest <- function(view, known.labels){
  oob.predictions <- view$model$predicted
  label.tab <- merge(known.labels, as.data.frame(oob.predictions), by.x="row.names", by.y="row.names", all=TRUE, sort=TRUE) 
  
  acc.list <- calculate.accuracy(label.tab,known.labels)
  view$acc <- acc.list$b.acc
  view$acc.norm <- normalize.accuracy.log(acc.list$b.acc)
  
  return(view)
}
## TODO: update.accuracy.svm function needs creation

## TODO: Double check this is correct. Why is it so complicated???
calculate.accuracy <- function(label.tab,known.labels){
  correct <- c()
  total <- c()
  for(label in as.character(unique(known.labels[,1]))){
    c <- length(which(label.tab[,2] == label & label.tab[,2] == label.tab[,3]))
    correct <- c(correct,c)
    t <- length(which(label.tab[,2] == label))
    total <- c(total,t)
  }
  
  acc <- sum(correct) / sum(total)
  b.acc <- ( ( correct[1] / total[1] ) + (correct[2] / total[2] ) )/2
  return(list(b.acc=b.acc,acc=acc))
}
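
## Worked example (toy numbers): 45/50 correct for the first label but only 1/10 correct
## for the second gives acc = 46/60 ~ 0.77, while the balanced accuracy used for view
## weighting is b.acc = (45/50 + 1/10) / 2 = 0.5.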

## get ensemble predictions for a majority setup - weighting votes by accuracy
# TODO: Update this to work with a stacked model version
get.majority.weighting <- function(view.list,majority,predictions,unique.labels){
  
  category.majority <- rep(NA,dim(predictions)[[1]])
  names(category.majority) <- rownames(predictions)
  final <- rep(NA,dim(predictions)[[1]])
  names(final) <- rownames(predictions)
  
  for(i in seq(length(category.majority))){
    p1 <- 0
    p2 <- 0
    for(view.i in seq(length(view.list))){
      if(!(is.na(predictions[i,view.i]))){
        if(predictions[i,view.i] == unique.labels[1]){
          p1 <- p1 + view.list[[view.i]]$acc.norm
        }
        if(predictions[i,view.i] == unique.labels[2]){
          p2 <- p2 + view.list[[view.i]]$acc.norm
        }
      }
    }
    if(p1 + p2 < majority){
      category.majority[i] <- "not.assessed"
    } else {
      if(p1 < majority & p2 < majority){
        category.majority[i] <- "not.agreed"
      }
    }
    if(p1 > p2 & p1 > majority){
      category.majority[i] <- "agreed"
      final[i] <- unique.labels[1]
    }
    if(p2 > p1 & p2 > majority){
      category.majority[i] <- "agreed"
      final[i] <- unique.labels[2]
    }
    
  }
  
  return(list(category.majority=category.majority,final=final))
}
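
## Worked example (toy numbers): three views with normalized accuracies 0.7, 0.4 and 0.3
## vote label1, label1, label2, so p1 = 1.1 and p2 = 0.3. With majority = 1.0 the sample is
## 'agreed' and labelled label1; with majority = 1.2 it is 'not.agreed' (no single label
## passes); with majority = 1.5 the total weight 1.4 falls short and it is 'not.assessed'.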

# weigh the predictions by accuracy of the single views
# TODO: Update this to work with a stacked model version
get.new.labels.majorityWeighted <- function(predictions,view.list,unique.labels){

  # get normalized accuracies of the views
  acc.norm <- c()
  for(view.i in seq(length(view.list))){
    acc.norm <- c(acc.norm,view.list[[view.i]]$acc.norm)
  }
  
  # sum up the accuracies for both labels
  p1.vec <- apply(predictions,1,function(x) 
    sum(acc.norm[which(x == unique.labels[1])])
  )
  p2.vec <- apply(predictions,1,function(x) 
    sum(acc.norm[which(x == unique.labels[2])])
  )
  
  # retrieve the maximal reached sum of accuracies and get all the predictions reaching it
  max.pred.value <- max(c(p1.vec,p2.vec))
  max.preds <- cbind(predictions, p1.vec, p2.vec)
  
  max.preds <- max.preds[which(max.preds[,"p1.vec"] >= max.pred.value | max.preds[,"p2.vec"] >= max.pred.value),,drop=FALSE]
  
  # from those, retrieve the predictions with the minimal value for a contrary prediction
  min.pred.value <- min(c(max.preds[,"p1.vec"],max.preds[,"p2.vec"]))
  
  min.max.preds <- max.preds[which(max.preds[,"p1.vec"] <= min.pred.value | max.preds[,"p2.vec"] <= min.pred.value),,drop=FALSE]
  
  # return them as the new labels
  new.labelled <- min.max.preds[,1,drop=FALSE]
  new.labelled[which(min.max.preds[,"p1.vec"] > min.max.preds[,"p2.vec"]),1] <- unique.labels[1]
  new.labelled[which(min.max.preds[,"p2.vec"] > min.max.preds[,"p1.vec"]),1] <- unique.labels[2]
  

  return(list(new.labelled=new.labelled, weighting.threshold.upper=max.pred.value, weighting.threshold.lower=min.pred.value))
  
}

## normalize accuracies from [0.5,1] to [0,1] and log-transform them
#normalize.accuracies <- function(view.list){
#  for(view.i in seq(length(view.list))){
#    view.list[[view.i]]$acc.norm <- normalize.accuracy.log(view.list[[view.i]]$acc)
#  }
#  return(view.list)
#}

# rescale accuracy from [0.5,1] to [0,1], then log-transform (-log(1-x)) so highly accurate views get larger weights
normalize.accuracy.log <- function(acc){
  if(is.na(acc)) {
    stop(paste0("ERROR: Non-numeric accuracy value:",acc))
  }
  if(acc >= 0 & acc <= 1){
    if(acc <= 0.5) { 
      acc <- 0.51 # Don't want negative values
    }
    if(acc == 1){
      acc <- 0.99  ## avoids Inf values
    }
    acc.norm <- (acc - 0.5) / (1-0.5)
    acc.norm <- -log(1-acc.norm)
  } else{
    stop(paste0("ERROR: Accuracy value not between 0.5 and 1"))
  }
  return(acc.norm)
}
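
## Worked examples of the transform acc.norm = -log(1 - (acc - 0.5)/0.5):
# normalize.accuracy.log(0.60)  # ~0.22
# normalize.accuracy.log(0.75)  # ~0.69, i.e. -log(0.5)
# normalize.accuracy.log(0.90)  # ~1.61, i.e. -log(0.2)
# normalize.accuracy.log(1.00)  # acc is capped at 0.99 first, so ~3.91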

## calculate performance values for a prediction ( used in cv.platypus )
## TODO: update this function
calculate.performance <- function(predictions,labels,unique.labels){
  labels <- as.character(labels[rownames(predictions),])
  correct <- rep(0,dim(predictions)[[1]])
  names(correct) <- rownames(predictions)
  correct[which(labels == predictions[,1])] <- 1
  
  predictions <- cbind(predictions,labels, correct)

  predictions.assessed <- predictions[which(predictions[,2] != "not.assessed"),,drop=FALSE]
  predictions.agreed <- predictions[which(predictions[,2] == "agreed"),,drop=FALSE]
  
  acc <- dim(predictions.agreed[which(predictions.agreed[,"correct"] == 1),,drop=FALSE])[[1]] / dim(predictions.agreed)[[1]]
  cov <- dim(predictions.agreed)[[1]] / dim(predictions.assessed)[[1]]
  
  # calculate balanced accuracy
  if(dim(predictions.agreed[which(predictions.agreed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]] != 0){
    sens <- dim(predictions.agreed[which(predictions.agreed[,"correct"] == 1 & predictions.agreed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]] / dim(predictions.agreed[which(predictions.agreed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]]
  } else{
    sens <- 0
  }
  if(dim(predictions.agreed[which(predictions.agreed[,"labels"] == unique.labels[2]),,drop=FALSE])[[1]] != 0){
    spec <- dim(predictions.agreed[which(predictions.agreed[,"correct"] == 1 & predictions.agreed[,"labels",drop=FALSE] == unique.labels[2]),,drop=FALSE])[[1]] / dim(predictions.agreed[which(predictions.agreed[,"labels"] == unique.labels[2]),,drop=FALSE])[[1]]
  } else{
    spec <- 0
  }
  
  bal.acc <- ( sens + spec ) / 2
  
  no.correct <- length(which(predictions.agreed[,"correct"] == 1))
  correct.classes <- c()
  for(class in unique.labels){
    correct.classes <- c(correct.classes,length(which(predictions.agreed[,"correct"] == 1 & predictions.agreed[,1] == class)))
  }
  names(correct.classes) <- unique.labels
  no.incorrect <- length(which(predictions.agreed[,"correct"] == 0))
  incorrect.classes <- c()
  for(class in unique.labels){
    incorrect.classes <- c(incorrect.classes,length(which(predictions.agreed[,"correct"] == 0 & predictions.agreed[,1] == class)))
  }
  names(incorrect.classes) <- unique.labels
  no.not.agreed <- dim(predictions.assessed)[[1]] - dim(predictions.agreed)[[1]]
  miss <- dim(predictions)[[1]] - dim(predictions.assessed)[[1]]
  
  return(list(accuracy=acc, coverage=cov, sensitivity=sens, specificity=spec, balanced.accuracy=bal.acc, correct=no.correct, correct.classes = correct.classes
              , incorrect=no.incorrect, incorrect.classes=incorrect.classes, not.agreed=no.not.agreed, missingData=miss))
}
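
## Example of the returned metrics (toy numbers): if 80 of 100 test samples could be assessed,
## 60 of those reached an 'agreed' ensemble call, and 45 of the agreed calls were correct, then
## coverage = 60/80 = 0.75 and accuracy = 45/60 = 0.75; sensitivity and specificity are computed
## per class over the agreed samples only, and balanced accuracy is their mean.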

## calculate performance values for each view
calculate.performance.view <- function(predictions,labels,unique.labels){
  labels <- as.character(labels[rownames(predictions),])
  correct <- rep(0,dim(predictions)[[1]])
  names(correct) <- rownames(predictions)
  correct[which(labels == predictions[,1])] <- 1
  
  predictions <- cbind(predictions,labels, correct)
  
  predictions.assessed <- predictions[which(!(is.na(predictions[,1]))),]
  
  acc <- dim(predictions.assessed[which(predictions.assessed[,"correct"] == 1),,drop=FALSE])[[1]] / dim(predictions.assessed)[[1]]
  # calculate balanced accuracy
  if(dim(predictions.assessed[which(predictions.assessed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]] != 0){
    sens <- dim(predictions.assessed[which(predictions.assessed[,"correct"] == 1 & predictions.assessed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]] / dim(predictions.assessed[which(predictions.assessed[,"labels"] == unique.labels[1]),,drop=FALSE])[[1]]
  } else{
    sens <- 0
  }
  if(dim(predictions.assessed[which(predictions.assessed[,"labels"] == unique.labels[2]),,drop=FALSE])[[1]] != 0){
    spec <- dim(predictions.assessed[which(predictions.assessed[,"correct"] == 1 & predictions.assessed[,"labels"] == unique.labels[2]),,drop=FALSE])[[1]] / dim(predictions.assessed[which(predictions.assessed[,"labels"] == unique.labels[2]),,drop=FALSE])[[1]]
  } else{
    spec <- 0
  }
  
  bal.acc <- ( sens + spec ) / 2
  
  no.correct <- length(which(predictions.assessed[,"correct"] == 1))
  correct.classes <- c()
  for(class in unique.labels){
    correct.classes <- c(correct.classes,length(which(predictions.assessed[,"correct"] == 1 & predictions.assessed[,1] == class)))
  }
  names(correct.classes) <- unique.labels
  no.incorrect <- length(which(predictions.assessed[,"correct"] == 0))
  incorrect.classes <- c()
  for(class in unique.labels){
    incorrect.classes <- c(incorrect.classes,length(which(predictions.assessed[,"correct"] == 0 & predictions.assessed[,1] == class)))
  }
  names(incorrect.classes) <- unique.labels
  miss <- dim(predictions)[[1]] - dim(predictions.assessed)[[1]]
  
  return(list(accuracy=acc, sensitivity=sens, specificity=spec, balanced.accuracy=bal.acc, correct=no.correct, correct.classes = correct.classes
              , incorrect=no.incorrect, incorrect.classes=incorrect.classes, missingData=miss))
  
  
}


## calculate performance values for platypus label learning validation (llv.platypus)
get.labelling.performance <- function(labelling.matrix,labels,unique.labels){
  
  ids <- intersect(rownames(labels),rownames(labelling.matrix))
  labelling.matrix <- labelling.matrix[ids,,drop=FALSE]
  labels <- labels[ids,,drop=FALSE]
  
  result.table <- c()
  for(i in seq(dim(labelling.matrix)[[2]])){
    vec <- labelling.matrix[,i] == labels[,1]
    acc <- length(vec[!(is.na(vec)) & vec==TRUE])/length(vec[!(is.na(vec))])
    cov <- length(vec[!(is.na(vec))]) / length(vec)
    
    unique.labels <- unique(labels[,1])
    acc.labels <- c()
    cov.labels <- c()
    for(l in seq(length(unique.labels))){
      pred.pos <- rownames(labelling.matrix[which(labelling.matrix[,i] == unique.labels[l]),,drop=FALSE])
      
      is.pos <- rownames(labels[which(labels[,1] == unique.labels[l]),,drop=FALSE])
      
      pred.na <- rownames(labelling.matrix[is.na(labelling.matrix[,i]),,drop=FALSE])
      
      if(length(is.pos[!(is.pos %in% pred.na)]) != 0){
        acc.label <- length(intersect(pred.pos,is.pos))/ length(is.pos[!(is.pos %in% pred.na)])
      } else{
        acc.label <- 0
      }
      
      acc.labels <- c(acc.labels,acc.label )
      cov.labels <- c(cov.labels, 1-(length(intersect(pred.na, is.pos)) / length(is.pos) ))
    }
    
    bal.acc <- mean(acc.labels)
    
    result.table <- rbind(result.table, c(i,acc,bal.acc,cov,acc.labels,cov.labels))
    colnames(result.table) <- c("iteration","accuracy","balanced.accuracy","coverage",paste0(unique.labels,"_accuracy"),paste0(unique.labels,"_coverage"))
  }
  return(result.table)
}