R/stability.lib.R

Defines functions getGraph_features f pop_better sim_inter sim_intra getGraph AnalyseStableModels_LOO LPO_best_models bestModelFeatureStability bestModelStability

Documented in AnalyseStableModels_LOO bestModelFeatureStability bestModelStability getGraph LPO_best_models sim_inter sim_intra

#' analyse stability of models from digest
#'
#' @description This function analyses prevalence of features of best model of different sparsity in crossval (here still k-folds)
#' @param X: dataset to classify
#' @param y: variable to predict
#' @param clf: an object containing the different parameters of the classifier
#' @param digested.result: the digest result from digest
#' @param method: wether to compute the stability of the best compared to the best in the folds (exact), or the top best (fuzzy)
#' @return an object with first a list of feature presence tables for each k_sparsity and a list of feature presence frequency
#' @export
bestModelStability <- function(X, y, clf, digested.result, method = "exact")
{

  # TODO sanity checks
  if(!(method == "exact" | method == "fuzzy"))
  {
    stop("bestModelStability: the method is unknown, should be one of c(exact/fuzzy)")
  }

  res <- list()
  # for all the k_sparse best models
  for(j in 1:length(digested.result$best$models))
  {
    if (isModel(digested.result$best$models[[j]])) {
      best.model <- digested.result$best$models[[j]]

    } else{
      best.model <- digested.result$best$models[[j]][[1]] #get the first best
    }

    acc <- best.model$accuracy_
    ci <- conf.inter(acc,dim(X)[2])
    if(clf$params$verbose) printModel(mod = best.model, method = clf$params$print_ind_method, score = "fit_")

    # convert to dense format
    best.model.dense <- modelToDenseVec(natts = nrow(X), mod = best.model)

    if(method == "exact")
    {
      k_sparse <- paste("k", best.model$eval.sparsity, sep = "_")
      # for all n-folds
      selected <- rep(0,length(digested.result$cv$nfold))
      for (i in 1:length(digested.result$cv$nfold))
      {
        mod.list <- digested.result$cv$nfold[[i]]$results[[k_sparse]]
        n <- length(mod.list)
        mod.list <- lapply(mod.list,function(x){
          if(x$fit_ > conf.inter(best.model$fit_,n))
            return(x)
        })
        mod.list <- mod.list[!unlist(lapply(mod.list, is.null))]
        for(mod in mod.list){
          #mod <- digested.result$cv$nfold[[i]]$results[[k_sparse]][[1]]
          if(!is.null(mod)){
            mod <- modelToDenseVec(natts = nrow(X), mod = mod)
            if(length(which(abs(best.model.dense)-abs(mod)==0))==nrow(X))
              selected[i] <- 1
          }
        }

      }
    }else # if fuzzy
    {
      k_sparse <- paste("k", best.model$eval.sparsity, sep = "_")
      # for all n-folds
      selected <- c()
      p<-1
      for (i in 1:length(digested.result$cv$nfold))
      {
        # get all of them
        for (s in 1:length(digested.result$cv$nfold[[i]]$results[[k_sparse]]))
        {
          mod <- digested.result$cv$nfold[[i]]$results[[k_sparse]][[s]]
          if(!is.null(mod)){
            mod <- modelToDenseVec(natts = nrow(X), mod = mod)
            if (length(which(abs(best.model.dense) - abs(mod) == 0)) == nrow(X)) {
              selected[p] <- 1
            } else{
              selected[p] <- 0
            }}else{
              selected[p] <- 0
            }
          p <- p + 1
        }
      }
    }
    res[[j]] <- selected
  } # end k_sparsity

  s <- unlist(lapply(res,function(x)sum(x)/length(x)))

  return(list(frequency=s, presence=res))
}



#' analyse stability of models from digest
#'
#' @description This function analyses prevalence of features of best model of different sparsity in crossval (here still k-folds) 
#' @param X: dataset to classify
#' @param y: variable to predict
#' @param clf: an object containing the different parameters of the classifier
#' @param digested.result: the digest result from digest
#' @param method: wether to compute the stability of the best compared to the best in the folds (exact), or the top best (fuzzy)
#' @return an object with first a list of feature presence tables for each k_sparsity and a list of feature presence frequency
#' @export
bestModelFeatureStability <- function(X, y, clf, digested.result, method = "fuzzy") 
{
  
  # TODO sanity checks
  if(!(method == "exact" | method == "fuzzy"))
  {
    stop("bestModelFeatureStability: the method is unknown, should be one of c(exact/fuzzy)")
  }
  
  res <- list()
  # for all the k_sparse best models
  for(j in 1:length(digested.result$best$models)) 
  {
    best.model <- digested.result$best$models[[j]] #get the first best
    if(clf$params$verbose) printModel(mod = best.model, method = clf$params$print_ind_method, score = "fit_")
    
    # convert to dense format
    best.model.dense <- modelToDenseVec(natts = nrow(X), mod = best.model)
    
    if(method == "exact")
    {
      k_sparse <- paste("k", best.model$eval.sparsity, sep = "_")
      # for all n-folds
      selected <- list()
      for (i in 1:length(digested.result$cv$nfold)) 
      {
        selected[[(length(selected)+1)]] <- digested.result$cv$nfold[[i]]$results[[k_sparse]][[1]]
      }
    }else # if fuzzy
    {
      k_sparse <- paste("k", best.model$eval.sparsity, sep = "_")
      # for all n-folds
      selected <- list()
      for (i in 1:length(digested.result$cv$nfold)) 
      {
        # get all of them
        for (s in 1:length(digested.result$cv$nfold[[i]]$results[[k_sparse]])) 
        {
          selected[[(length(selected)+1)]] <- digested.result$cv$nfold[[i]]$results[[k_sparse]][[s]]
          #print(s)
        }
      }
    }
    
    # delete null if any (potentially in some folds)
    best.in.nfolds <- selected[!unlist(lapply(selected, is.null))]
    #plotPopulation(best.in.nfolds, X, y)
    # transform to dense version
    best.in.folds.dense <- listOfModelsToListOfDenseVec(clf, X, y, list.models = best.in.nfolds)
    # make a matrix
    best.in.folds.dense <- t(do.call(rbind.data.frame, best.in.folds.dense))
    rownames(best.in.folds.dense) <- rownames(X)
    
    # get the best model features in the best k-folds.
    best.in.folds.dense[best.model.dense != 0,]
    #best.in.folds.dense <- which(best.in.folds.dense!=0)
    res[[j]] <- best.in.folds.dense[best.model.dense != 0,]
    
    if(k_sparse == "k_1")
    {
      res[[j]] <- t(as.matrix(res[[j]], nrow = 1, byrow = TRUE))
      rownames(res[[j]]) <- best.model$names_
    }
  } # end k_sparsity
  
  # # compute frequency
  # res[[1]] <- t(as.matrix(res[[1]], nrow = 1, byrow = TRUE))
  # rownames(res[[1]]) <- digested.result$best$models[[1]]$names_
  s <- list()
  for(i in 1:length(res))
  {
    s[[i]] <- sort(apply(res[[i]], 1 , function(x) sum(x!=0)/length(x)), decreasing = TRUE)
  }
    
  return(list(frequency=s, presence=res))
}


#' Compute the cross-validation of leave one out for test stability
#'
#' @description Compute the cross-validation emprirical and generalization scores.
#' @param X: the data matrix with variables in the rows and observations in the columns
#' @param y: the response vector
#' @param clf: clf
#' @param lfolds: leave one out folds for cross-validation
#' @param return.all: return all results from the crossvalidation for feature stability testing
#' @return a list containing generalisation scores for each fold as well as a matrix with the mean values.
#' @export
LPO_best_models <- function(X, y, clf, p=1, lfolds=NULL, return.all=FALSE,nk=20)
{
  
  # create the folds if they are not given
  if(is.null(lfolds))
  {
    lfolds = create.folds(y, k = -p, list = TRUE, returnTrain = FALSE) 
    nfolds <- nk
  } else {
    nfolds <- length(lfolds)
  }
  
  # create the empty object structure that will be returned
  res.crossval                                <- list()
  res.crossval$nfold                          <- list()
  #res.crossval$k                              <- list()
  res.crossval$scores                         <- list()
  # res.crossval$scores$empirical.auc           <- as.data.frame(matrix(nrow=length(clf$params$sparsity), ncol=nfolds))
  # rownames(res.crossval$scores$empirical.auc) <- c(paste("k",clf$params$sparsity,sep="_"))
  # colnames(res.crossval$scores$empirical.auc) <- paste("fold",1:nfolds,sep="_")
  # # add others using the same model
  # res.crossval$scores$generalization.auc      <- res.crossval$scores$empirical.auc
  res.crossval$scores$empirical.acc           <- as.data.frame(matrix(nrow=length(clf$params$sparsity), ncol=nfolds))
  rownames(res.crossval$scores$empirical.acc) <- c(paste("k",clf$params$sparsity,sep="_"))
  colnames(res.crossval$scores$empirical.acc) <- paste("fold",1:nfolds,sep="_")
  res.crossval$scores$generalization.acc      <- res.crossval$scores$empirical.acc # accuracy
  
  res.crossval_2                               <- res.crossval
  # for each fold compute the best models
  registerDoSNOW(cl <- makeCluster(clf$params$nCores, type = "SOCK",outfile=''))
  #cl <- makeCluster(clf$params$nCores, type = "FORK",outfile='LOG.TXT')
  #registerDoParallel(cl)
  clf$params$cluster <- cl
  if(clf$params$parallelize.folds)
  {
    print("Starting cross validation in parallel")
    # execute each crossVal in //
    
    
    res.all <- foreach (i = 1:nfolds) %dorng%
    {
      
      #printClassifier(obj = clf)
      # if (clf$params$verbose) {cat("===> k-fold\t",i,"\n")}
      
      # prepare the datasets
      # training dataset
      x_train = X[,-lfolds[[i]]]
      y_train = y[-lfolds[[i]]]
      # testing dataset
      x_test = X[,lfolds[[i]]]
      y_test = y[lfolds[[i]]]
      
      runClassifier(X = x_train, y =  y_train, clf = clf)
      
    } # end of folds loop (foreach)
    
  }else # no parallel
  {
    # execute each crossval in serial
    res.all <- list()
    for (i in 1:nfolds)
    {
      #print("Starting crossvalidation not in parallel")
      if (clf$params$verbose) {cat("===> k-fold\t",i,"\n")}
      
      # prepare the datasets
      # training dataset
      x_train = X[,-lfolds[[i]]]
      y_train = y[-lfolds[[i]]]
      # testing dataset
      x_test = X[,lfolds[[i]]]
      y_test = y[lfolds[[i]]]
      
      res.all[[i]]                      <- runClassifier(X = x_train, y =  y_train, clf = clf)
      
    } # end of folds loop (for)
  } # end else parallelize.folds
  
  
  # Dispatch the results in the custom output structure
  for(i in 1:length(res.all))
  {
    
    # prepare the datasets
    # training dataset
    x_train = X[,-lfolds[[i]]]
    y_train = y[-lfolds[[i]]]
    # testing dataset
    x_test = X[,lfolds[[i]]]
    y_test = y[lfolds[[i]]]
    
    res_train <- res.all[[i]]
    
    # res_train                         <- runClassifier(X = x_train, y =  y_train, clf = clf)
    res_train.digest                    <- digestModelCollection(obj = res_train, X = x_train, clf, mmprev=FALSE)
    res_train.digest_2                  <- digestModelCollection(obj = res_train, X = x_train, clf, mmprev=TRUE)
    if(!is.null(res_train.digest))
    {
      # for all the best models of each k-sparse (create empty matrix) for auc
      # res.crossval$k$auc                <- as.data.frame(matrix(nrow=max(clf$params$sparsity), ncol=2))
      # rownames(res.crossval$k$auc)      <- c(paste("k",c(1:max(clf$params$sparsity)), sep="_")) 
      # colnames(res.crossval$k$auc)      <- c("empirical","generalization")
      # add another table for accuracy
      #res.crossval$k$acc                <- as.data.frame(matrix(nrow=max(clf$params$sparsity), ncol=2))
      # add another table for correlation
      # res.crossval$k$cor                <- res.crossval$k$auc
      
      # for all k-sparse BEST models
      for(k in 1:length(res_train.digest$best_models))
      {
        k_sparse.name       <- names(res_train.digest$best_models)[k]
        mod                 <- res_train.digest$best_models[[k_sparse.name]]
        mod.train           <- evaluateModel(mod=mod, X = x_train, y = y_train, clf = clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='train')
        mod.test            <- list() #evaluateModel(mod=mod, X=x_test, y=y_test, clf=clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='test')
        scorelist           <- getModelScore(mod = mod, X = as.matrix(x_test), clf = clf, force.re.evaluation = TRUE)
        mod$score_          <- scorelist$score_
        mod$pos_score_      <- scorelist$pos_score_
        mod$neg_score_      <- scorelist$neg_score_
        
        mod.test$accuracy_  <- evaluateAccuracy(mod = mod, X = x_test, y = y_test, clf = clf)$accuracy_
        
        # Empirical fitting score
        #res.crossval$scores$empirical.auc[k_sparse.name,i]            <- mod.train$auc_
        res.crossval$scores$empirical.acc[k_sparse.name,i]            <- mod.train$accuracy_
        #res.crossval$scores$empirical.cor[k_sparse.name,i]            <- mod.train$cor_
        
        # Generalization fitting score
        #res.crossval$scores$generalization.auc[k_sparse.name,i]       <- mod.test$auc_
        res.crossval$scores$generalization.acc[k_sparse.name,i]       <- mod.test$accuracy_
        #res.crossval$scores$generalization.cor[k_sparse.name,i]       <- mod.test$cor_
        
        # store by k
        # AUC
        #res.crossval$k$auc[k_sparse.name,"empirical"]                 <- mod.train$auc_
        #res.crossval$k$auc[k_sparse.name,"generalization"]            <- mod.test$auc_
        # Accuracy
        #res.crossval$k$acc[k_sparse.name,"empirical"]                 <- mod.train$accuracy_
        #res.crossval$k$acc[k_sparse.name,"generalization"]            <- mod.test$accuracy_
        # Regression
        #res.crossval$k$cor[k_sparse.name,"empirical"]                 <- mod.train$cor_
        #res.crossval$k$cor[k_sparse.name,"generalization"]            <- mod.test$cor_
      } # end of k_sparse loop
      
      if(return.all)
      {
        res.crossval$nfold[[i]]             <- list(results = res_train$models, resultsDigest = res_train.digest)
      }
      
      # if saving move one level up
      if(!(clf$params$popSaveFile=="NULL"))
      {
        setwd("..")
      }
    }
    if(!is.null(res_train.digest_2))
    {
      # for all the best models of each k-sparse (create empty matrix) for auc
      # res.crossval$k$auc                <- as.data.frame(matrix(nrow=max(clf$params$sparsity), ncol=2))
      # rownames(res.crossval$k$auc)      <- c(paste("k",c(1:max(clf$params$sparsity)), sep="_")) 
      # colnames(res.crossval$k$auc)      <- c("empirical","generalization")
      # add another table for accuracy
      #res.crossval$k$acc                <- as.data.frame(matrix(nrow=max(clf$params$sparsity), ncol=2))
      # add another table for correlation
      # res.crossval$k$cor                <- res.crossval$k$auc
      
      # for all k-sparse BEST models
      for(k in 1:length(res_train.digest_2$best_models))
      {
        k_sparse.name       <- names(res_train.digest_2$best_models)[k]
        mod                 <- res_train.digest_2$best_models[[k_sparse.name]]
        mod.train           <- evaluateModel(mod = mod, X = x_train, y = y_train, clf = clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='train')
        mod.test            <- list() #evaluateModel(mod=mod, X=x_test, y=y_test, clf=clf, eval.all = TRUE, force.re.evaluation = TRUE, mode='test')
        scorelist           <- getModelScore(mod = mod, X = as.matrix(x_test), clf = clf, force.re.evaluation = TRUE)
        mod$score_          <- scorelist$score_
        mod$pos_score_      <- scorelist$pos_score_
        mod$neg_score_      <- scorelist$neg_score_
        
        mod.test$accuracy_  <- evaluateAccuracy(mod = mod, X = x_test, y = y_test, clf = clf)$accuracy_
        
        # Empirical fitting score
        #res.crossval$scores$empirical.auc[k_sparse.name,i]            <- mod.train$auc_
        res.crossval_2$scores$empirical.acc[k_sparse.name,i]            <- mod.train$accuracy_
        #res.crossval$scores$empirical.cor[k_sparse.name,i]            <- mod.train$cor_
        
        # Generalization fitting score
        #res.crossval$scores$generalization.auc[k_sparse.name,i]       <- mod.test$auc_
        res.crossval_2$scores$generalization.acc[k_sparse.name,i]       <- mod.test$accuracy_
        #res.crossval$scores$generalization.cor[k_sparse.name,i]       <- mod.test$cor_
        
        # store by k
        # AUC
        #res.crossval$k$auc[k_sparse.name,"empirical"]                 <- mod.train$auc_
        #res.crossval$k$auc[k_sparse.name,"generalization"]            <- mod.test$auc_
        # Accuracy
        #res.crossval$k$acc[k_sparse.name,"empirical"]                 <- mod.train$accuracy_
        #res.crossval$k$acc[k_sparse.name,"generalization"]            <- mod.test$accuracy_
        # Regression
        #res.crossval$k$cor[k_sparse.name,"empirical"]                 <- mod.train$cor_
        #res.crossval$k$cor[k_sparse.name,"generalization"]            <- mod.test$cor_
      } # end of k_sparse loop
      
      if(return.all)
      {
        res.crossval_2$nfold[[i]]             <- list(results = res_train$models, resultsDigest = res_train.digest_2)
      }
      
      # if saving move one level up
      if(!(clf$params$popSaveFile=="NULL"))
      {
        setwd("..")
      }
    }
  }
  
  # should we return all
  if(return.all)
  {
    names(res.crossval$nfold)             <- paste("n", 1:nfolds, sep = "_") # they should have the same length
    names(res.crossval_2$nfold)             <- paste("n", 1:nfolds, sep = "_") # they should have the same length
  }
  
  reorderByRownamesNumeric <- function(mat)
  {
    ind <- as.numeric(gsub("k_","",rownames(mat)))
    mat <- mat[order(ind),]
  }  
  # reorder results
  res.crossval$scores$empirical.acc       <- reorderByRownamesNumeric(res.crossval$scores$empirical.acc)
  res.crossval$scores$generalization.acc  <- reorderByRownamesNumeric(res.crossval$scores$generalization.acc)

  # accuracy
  res.crossval$scores$mean.acc            <- data.frame(cbind(rowMeans(res.crossval$scores$empirical.acc, na.rm = TRUE),
                                                              rowMeans(res.crossval$scores$generalization.acc, na.rm = TRUE)))
  colnames(res.crossval$scores$mean.acc)  <- c("empirical","generalization")
  
  res.crossval_2$scores$empirical.acc       <- reorderByRownamesNumeric(res.crossval_2$scores$empirical.acc)
  res.crossval_2$scores$generalization.acc  <- reorderByRownamesNumeric(res.crossval_2$scores$generalization.acc)
  res.crossval_2$scores$mean.acc            <- data.frame(cbind(rowMeans(res.crossval_2$scores$empirical.acc, na.rm = TRUE),
                                                              rowMeans(res.crossval_2$scores$generalization.acc, na.rm = TRUE)))
  colnames(res.crossval_2$scores$mean.acc)  <- c("empirical","generalization")
  ret <- c()
  ret$res.crossval <- res.crossval
  ret$res.crossval_2 <- res.crossval_2

  stopCluster(cl)
  
  return(ret)
}


#' analyse stability of models from digest
#'
#' @description This function analyses prevalence of features of best model of different sparsity in crossval (here still k-folds) 
#' @param X: dataset to classify
#' @param y: variable to predict
#' @param clf: an object containing the different parameters of the classifier
#' @param tmp: the digested result object from digest
#' @return a list of each sparsity the frequency of each feature of empirical best model in k-folds cross validation
#' @export
AnalyseStableModels_LOO <- function(X, y, clf, tmp, loo)
{
  #loo <- LOO_best_models(X,y,clf,return.all = TRUE)
  res <- list()
  for(j in 1:length(tmp$best$models)) {
    best.model <- tmp$best$models[[j]]
    printModel(mod = best.model, method = clf$params$print_ind_method, score = "fit_")
    # |-40|+295|-304|-373|F=0.8306|K=4
    best.model.dense <-
      modelToDenseVec(natts = nrow(X), mod = best.model)
    
    best.in.nfolds  <- list()
    n <- 1
    k_sparse <- paste("k", best.model$eval.sparsity, sep = "_")
    
    for (i in 1:length(loo$nfold))
    {
      selected <-
        ceiling(length(loo$nfold[[i]]$results[[k_sparse]]) * 0.1)
      for (s in 1:selected) {
        best.in.nfolds[[n]] <- loo$nfold[[i]]$results[[k_sparse]][[s]]
        
        n <- n + 1
      }
    }
    
    best.in.nfolds <-
      best.in.nfolds[!unlist(lapply(best.in.nfolds, is.null))]
    #plotPopulation(best.in.nfolds, X, y)
    best.in.folds.dense <- listOfModelsToListOfDenseVec(clf, X, y, list.models = best.in.nfolds)
    
    best.in.folds.dense <-
      t(do.call(rbind.data.frame, best.in.folds.dense))
    rownames(best.in.folds.dense) <- rownames(X)
    
    # get the best model features in the best k-folds.
    best.in.folds.dense[best.model.dense != 0,]
    #best.in.folds.dense <- which(best.in.folds.dense!=0)
    res[[j]] <- best.in.folds.dense[best.model.dense != 0,]
    
  }
  res[[1]] <- t(as.matrix(res[[1]], nrow = 1, byrow = TRUE))
  rownames(res[[1]]) <- tmp$best$models[[1]]$names_
  s<- list()
  for(i in 1:length(res))
    s[[i]]<-sort(apply(res[[i]],1,function(x)length(which(x!=0))/length(x)),decreasing = TRUE)
  return(list(freq=s,origin=res)) 
  
}

#' getGraph 
#'
#' @description This function gets a graph of the result of analyseStableModels
#' @param X: dataset to classify
#' @param mat: AnalyseStableModels()$origin
#' @param threshold: Used to select a number of edgets. By default this is zero.
#' @return a graph
#' @export
getGraph <- function(mat, X, threshold=0){
  links <- array(0, dim=c(nrow(X),nrow(X)+1))
  rownames(links) <- rownames(X)
  colnames(links) <- c(rownames(X),'nodes')
  for(i in 1:length(mat))
  {
    stas <- apply(mat[[i]],1,function(x)length(which(x!=0))/length(x))
    for(j in 1:length(stas))
      links[names(stas[j]),'nodes'] <- links[names(stas[j]),'nodes'] + as.numeric(stas[j])
    if(i>1){
      comb <- combn(names(stas), 2)
      for(j in 1:dim(comb)[2]){
        links[comb[1,j],comb[2,j]] <- links[comb[1,j],comb[2,j]]+1
        links[comb[2,j],comb[1,j]] <- links[comb[2,j],comb[1,j]]+1
      }
    }
  }
  g <- graph.adjacency(links[,1:nrow(X)], mode="undirected", weighted=TRUE)
  V(g)$size   <- links[,'nodes']
  E(g)$width  <- E(g)$weight
  
  iso         <- V(g)[degree(g)==0]
  g2 <- delete.vertices(g, iso)
  
  iso_2       <- V(g2)[V(g2)$size==0] 
  g2 <- delete.vertices(g2, iso_2)
  
  co <- layout_nicely(g2)
  plot(g2, layout=co, vertex.label.cex=0.5)
  return(g2)
}

#' compare stability of different modeles (intra k)
#'
#' @description This function compares stability of different modeles (intra k)
#' @param X: dataset to classify
#' @param tmp: the digested result from digest
#' @return a num
#' @export
sim_intra <- function(tmp, X)
{
  n <- dim(X)[1]
  s <- list()
  #t <- list()
  for(i in 1:length(tmp$best$models)){
    s[[i]] <- tmp$best$models[[i]]$indices_
    #t[[i]] <- tmp$cv$nfold$n_1$resultsDigest$best_models[[i]]$indices_
  }
  
  c <- length(s)
  sim <- 0
  #sim2 <- 0
  for(i in 1:(c-1))
    for(j in (i+1):c){
      sim <- sim +  (length(intersect(s[[i]],s[[j]]))-length(s[[i]])*length(s[[j]])/n)/(min(length(s[[i]]),length(s[[j]]))-max(0,length(s[[i]])+length(s[[j]])-n))
      #sim2 <- sim2 +  (length(intersect(t[[i]],t[[j]]))-length(t[[i]])*length(t[[j]])/n)/(min(length(t[[i]]),length(t[[j]]))-max(0,length(t[[i]])+length(t[[j]])-n))
    }
  
  return(sim*2/(c*(c-1)))#list(sim*2/(c*(c-1)),sim2*2/(c*(c-1))))
}

conf.inter <- function (r, n)
{
  s <- sqrt(r * (1 - r)/n)
  ci <- 0.5/n + 1.96 * s
  return(r - ci)#1.96*s)
}
# r : error rate of the best classifier
# n : number of test examples
# return the upper bound of the error rate 



#' compare stability of different modeles (inter k)
#'
#' @description This function compares stability of different modeles (inter k)
#' @param X: dataset to classify
#' @param tmp: the digested result from digest
#' @return a num
#' @export
sim_inter <- function(tmp, X)
{
  n <- dim(X)[1]
  kunch <- c()
  kunch_oppo <- c()
  kalousis <- c()
  dunne <- c()
  cw <- c()
  cw_rel <- c()
  auc_ <- c()
  acc_ <- c()
  language <- list()
  language$ratio <- list()
  language$terinter <- list()
  language$bininter <- list()
  lang_cv<-list()
  best_spar <- names(tmp$best$models)
  
  # compute the model in the conf_inter
  accuracy <- tmp$best$scores$accuracy_
  Accuracy <- max(accuracy)
  ci <- conf.inter(Accuracy, dim(X)[2])
  
  for(k in best_spar){
    
    t <- list()
    coef <- list()
    auc <- c()
    acc <- c()
    lang <- c()
    for (i in 1:length(tmp$cv$nfold)) {
      mod <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]
      if(length(mod)==0 || mod$accuracy_ < ci ){
        t[[i]] <- NA
        coef[[i]] <- NA
        auc[i] <- NA
        acc[i] <- NA
        lang[i] <- NA    
      }else{
        t[[i]] <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$indices_
        if(is.null(tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$coeffs_)){
          coef[[i]] <- 0
        }else{coef[[i]] <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$coeffs_}
        
        auc[i] <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$auc_
        acc[i] <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$accuracy_
        lang[i] <- tmp$cv$nfold[[i]]$resultsDigest$best_models[[k]]$language
      }
    }
    
    
    # remove NA
    t <- t[!is.na(t)]
    coef <- coef[!is.na(coef)]
    auc <- auc[!is.na(auc)]
    acc <- acc[!is.na(acc)]
    
    c <- length(t)
    if(c==0 ||  c==1){
      kunch <- c(kunch,0)
      kunch_oppo <- c(kunch_oppo ,0)
      kalousis <- c( kalousis,0)
      dunne  <- c(dunne ,0)
      cw <- c(cw,0)
      cw_rel <- c(cw_rel,0)
      auc_ <- c(auc_,0)
      acc_ <- c(acc_,0)
    }else{
      sim_1 <- 0
      sim_2 <- 0
      sim_3 <- 0
      sim_4 <- 0
      for(i in 1:(c-1))
        for(j in (i+1):c){
          a <- coef[[i]]
          b <- coef[[j]]
          oppo <- sum(unname(a[intersect(names(a),names(b))]!=b[intersect(names(a),names(b))]))
          
          inter <- intersect(t[[i]],t[[j]])
          sim_1 <- sim_1 +  (length(inter)-length(t[[i]])*length(t[[j]])/n)/(min(length(t[[i]]),length(t[[j]]))-length(t[[i]])*length(t[[j]])/n)#max(0,length(t[[i]])+length(t[[j]])-n))
          sim_2 <- sim_2 +  length(inter)/length(union(t[[i]],t[[j]]))
          sim_3 <- sim_3 + (length(setdiff(t[[i]],t[[j]]))+length(setdiff(t[[j]],t[[i]])))/n
          sim_4 <- sim_4 +  (length(inter) - oppo/2 -length(t[[i]])*length(t[[j]])/n)/(min(length(t[[i]]),length(t[[j]]))-length(t[[i]])*length(t[[j]])/n)
          
        }
      kunch <- c(kunch,sim_1*2/(c*(c-1)))
      kunch_oppo <- c(kunch_oppo,sim_4*2/(c*(c-1)))
      kalousis <- c(kalousis,sim_2*2/(c*(c-1)))
      dunne <- c(dunne,sim_3*2/(c*(c-1)))
      tab_t <- table(unlist(t))
      
      cw <- c(cw,sum(apply(tab_t,1,function(x) x*(x-1)/((c-1)*sum(tab_t)))))
      D <- sum(tab_t) %%  n
      H <- sum(tab_t) %% c
      cw_rel <- c(cw_rel,(n*(sum(tab_t)-D+sum(apply(tab_t,1,function(x) x*(x-1))))-sum(tab_t)*sum(tab_t)+D*D)/(n*(H*H+c*(sum(tab_t)-H)-D)-sum(tab_t)*sum(tab_t)+D*D))
      
      auc_ <- c(auc_,mean(auc))
      acc_ <- c(acc_,mean(acc))
    }
    if(length(t[which(lang=='ratio')])!=0){
      language$ratio[(length(language$ratio)+1):(length(language$ratio)+length(t[which(lang=='ratio')]))] <- t[which(lang=='ratio')]
    }
    if(length(t[which(lang=='terinter')])!=0){
      language$terinter[(length(language$terinter)+1):(length(language$terinter)+length(t[which(lang=='terinter')]))] <- t[which(lang=='terinter')]
    }
    if(length(t[which(lang=='bininter')])!=0){
      language$bininter[(length(language$bininter)+1):(length(language$bininter)+length(t[which(lang=='bininter')]))] <- t[which(lang=='bininter')]
    }
    lang_cv[[k]] <- table(lang)
    
  }
  rel <- function(t,n){
    tab_t <- table(unlist(t))
    c <- length(t)
    D <- sum(tab_t) %%  n
    H <- sum(tab_t) %% c
    cw_rel <- (n*(sum(tab_t)-D+sum(apply(tab_t,1,function(x) x*(x-1))))-sum(tab_t)*sum(tab_t)+D*D)/(n*(H*H+c*(sum(tab_t)-H)-D)-sum(tab_t)*sum(tab_t)+D*D)
    return(cw_rel)
  }
  
  #names(s) <-  names(tmp$best$models)
  res <- c()
  res$sparsity <- unlist(lapply(best_spar, function(x) as.numeric(unlist(strsplit(x, "k_"))[2])))
  res$kunch <- kunch
  res$kunch_oppo <- kunch_oppo
  res$kalousis <- kalousis
  res$dunne <- dunne
  res$cw <- cw
  res$cw_rel <- cw_rel
  res$lang_cv <- lang_cv
  res$cw_rel_languages <- unlist(lapply(language, function(x) rel(x,n)))
  res$language <-unname( unlist(lapply(tmp$best$models,function(x)x$language)))
  res$auc_ <- auc_
  res$acc_ <- acc_
  return(res)
}

pop_better <- function(mod.col,eval='fit_',k_penalty=0.01){
  if(isPop(mod.col)){
    pop <- mod.col
    mod.col <- listOfModels2ModelCollection(mod.col)
  }else{
    pop <- modelCollectionToPopulation(mod.col)
  }
  bests   <- getBestIndividual(mod.col,evalToOrder = eval)
  ev      <- populationGet_X(element2get = eval, toVec = TRUE, na.rm = TRUE)(bests)
  names   <- names(ev)
  ev_2    <- shift(ev,1)
  ev_2[1] <- 0.5
  names(ev_2) <- names
  new_pop <- lapply(pop,function(x){
    
    if(as.numeric(x[eval]) > unname(ev_2[paste('k',x$eval.sparsity,sep = '_')] + k_penalty)){
      return(x)
    }
  })
  new_pop      <- new_pop[unlist(lapply(new_pop,is.null))==FALSE]
  mod.res      <- listOfModels2ModelCollection(new_pop)
  return(new_pop)
}



#ggplot()+geom_point(aes(x=res$auc_,y=res$,color=res$language))
                    
f<-function(tmp,X){
  l <- lapply(tmp$best$models,function(x)x$indices_)
  all <- sort(unique(unlist(unname(l))))
  mat <- matrix(0,nrow=length(all),ncol=length(l))
  rownames(mat) <- all
  for(i in 1:length(l)){
    mat[as.character(unlist(unname(l[i]))) ,i] <- 1
  }
  #image(t(mat))
  heatmap(mat)
}                    

# load("../../2.db_cirrhose_k_species_stage1/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../2.db_cirrhose_k_species_stage1/3.terga2_languages_3K_models/results.terga2.terinter_spar_1_to_30.rda")
# load("../../../data/pasolli_2016/cirrhosis_stage_1_known_species.rda")
# 
# 
# load("../../2.db_cirrhose_k_species_stage2/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/cirrhosis_stage_2_known_species.rda")
# 
# load("../../2.db_colorectal_k_species/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/colorectal_known_species.rda")
# 
# load("../../2.db_ibd_k_species/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/ibd_known_species.rda")
# 
# load("../../2.db_obesity_k_species/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/obesity_known_species.rda")
# 
# load("../../2.db_t2d_k_species/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/t2d_known_species.rda")
# 
# load("../../2.db_wt2d_k_species/6.metal_languages_1K_models/results.metal_spar_v2m_new1_to_30.rda")
# load("../../../data/pasolli_2016/wt2d_known_species.rda")
# X <- X_filt
# tmp <- digest(res.metal.cv.v2m_new)
# res <- sim_inter(tmp,X)
# f(tmp,X)
# #ggplot for res$lang_cv
# 
# ggplot(melt(res$lang_cv)) +aes(lang, value,fill=factor(lang),color=factor(lang))+
#   geom_bar(stat='identity') +
#   facet_wrap(~L1)
# 
# #plot for result of sim_inter
# ggplot()+geom_line(aes(x=c(1:length(res$kunch)),y=res$kunch,color=factor("kunch")))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$kunch_oppo,color=factor('kunch_oppo')))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$kalousis,color=factor('kalousis')))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$dunne,color=factor('dunne')))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$cw,color=factor('cw')))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$acc_,color=factor('acc')))+
#   geom_line(aes(x=c(1:length(res$kunch)),y=res$auc_,color=factor('auc')))+
#   xlab('sparsity')+ylab('stability')+labs(colour = "measures")
# 
# ggplot()+geom_line(aes(x=res$auc_,y=res$kunch,color=factor("kunch")))+
#   geom_line(aes(x=res$auc_,y=res$kalousis,color=factor('kalousis')))+
#   geom_line(aes(x=res$auc_,y=res$dunne,color=factor('dunne')))+
#   geom_line(aes(x=res$auc_,y=res$cw,color=factor('cw')))+
#   geom_line(aes(x=res$auc_,y=res$acc_,color=factor('acc')))+
#   xlab('auc')+ylab('stability')+labs(colour = "measures")+
#   geom_text(aes(label = names(res$auc_)))
# 
# 
# auc<-lapply(res.metal.cv.v2m_new$classifier$models,function(x) lapply(x,function(y)y$auc_))
# 
# ind <- unname(lapply(res.metal.cv.v2m_new$classifier$models,function(x) lapply(x,function(y)y$indice)))


getGraph_features <- function(ind,X){
  library('binhf')
  dig<-digest(res.metal.cv.v2m_new)
  auc<-dig$best$scores$auc_
  name<-names(auc)
  #unname(unlist(lapply(dig$best$models,function(x)x$auc_)))
  auc<-unname(shift(auc,1))
  names(auc)<-name
  auc[1] <- 0
  ind <- list()
  score <- list()
  n   <- 1
  for(i in 1:length(res.metal.cv.v2m_new$classifier$models)){
    t<- unique(res.metal.cv.v2m_new$classifier$models[[i]])
    for(j in 1:length(t)){
      if(t[[j]]$auc_ >= auc[i]){
        ind[[n]] <- t[[j]]$coeffs_
        score[n] <- t[[j]]$accuracy_
        n<-n+1
      }
    }
  }
  
  feat <- array(0, dim=c(length(unique(names(unlist(ind)))),30))
  rownames(feat)<-unique(names(unlist(ind)))
  for(i in 1:length(ind)){
    for(j in 1:length(ind[[i]])){
      feat[names(ind[[i]])[j],length(ind[[i]])] <-feat[names(ind[[i]])[j],length(ind[[i]])] +1 
    }
  }
  heatmap.2(feat,Rowv=NA,Colv = NA)
  
  links <- array(0, dim=c(nrow(X),nrow(X)+2))
  rownames(links) <- rownames(X)
  colnames(links) <- c(rownames(X),'nodes','scores')
  for(i in 1:length(ind)){
    if(length(ind[[i]])==1){
      links[names(ind[[i]]),'nodes'] <- links[names(ind[[i]]),'nodes']+1
    }else{
    pairs <- combn(names(ind[[i]]),2)
    for(j in 1:length(pairs[1,])){
      links[pairs[1,j],pairs[2,j]]<-links[pairs[1,j],pairs[2,j]]+1
      links[pairs[2,j],pairs[1,j]]<-links[pairs[2,j],pairs[1,j]]+1
       }
    for(j in 1:length(ind[[i]])){
      links[names(ind[[i]])[j],'nodes'] <- links[names(ind[[i]])[j],'nodes']+1
      links[names(ind[[i]])[j],'scores'] <- links[names(ind[[i]])[j],'scores']+as.numeric(score[[i]])
       }
     }
  }
  links[,'scores']<-links[,'scores']/links[,'nodes']
  links[is.na(links)] <- 0
  g <- graph.adjacency(links[,1:nrow(X)], mode="undirected", weighted=TRUE)
  V(g)$size   <- links[,'scores']*5
  E(g)$width  <- E(g)$weight
  
  iso         <- V(g)[degree(g)==0]
  g2 <- delete.vertices(g, iso)
  
  iso_2       <- V(g2)[V(g2)$size==0] 
  g2 <- delete.vertices(g2, iso_2)
  # 
  iso_3       <- E(g2)[E(g2)$width< max(E(g2)$weight)-5] 
  g2 <- delete.edges(g2, iso_3)
  
  iso_4         <- V(g2)[degree(g2)==0]
  g3 <- delete.vertices(g2, iso_4)
  
  co <- layout_nicely(g2)
  co <- layout.fruchterman.reingold(g3)
  plot(g3, layout=co, vertex.label.cex=0.8)
}
# m <- melt(ind)
# names(m) <- c('features','num','k')
# lapply(1:30, function(x)m[m$k>x,])
# 
# #inds <- lapply(ind,function(x)lapply(x,function(y)lapply(y,unlist)) )
# #inds <- lapply(ind,function(x)unlist(lapply(x,function(y) unlist(y)))) 
# inds <- list()
# n <- 1
# for(i in 1:length(ind)){
#   #inds[[n]] <- 
#   for(j in 1:length(ind[[i]])){
#     
#     inds[[n]]<-ind[[i]][[j]]
#     n<-n+1
#   }
# }
# 
# d<-digest_mmprev(clf.metal.v2m_new,X,res.metal.cv.v2m_new)
predomics/predomicspkg documentation built on Dec. 11, 2024, 11:06 a.m.