R/analyzeImportantFeaturesFBM.R


#' Analyze and Plot Feature Importance for Feature-Based Models (FBM)
#'
#' This function analyzes and visualizes feature importance for FBM models,
#' creating plots for feature prevalence, importance, and effect size. It
#' supports both single and multiple experiments, handling classification and
#' regression modes.
#' @import stats grDevices
#' @param clf_res A classifier result object or a list of classifier results.
#'   Each classifier result must contain the trained models and the classifier
#'   parameters.
#' @param X The feature matrix.
#' @param y The response variable.
#' @param makeplot Logical, if `TRUE` (default), plots are generated and saved
#'   to a PDF file.
#' @param saveplotobj Logical, if `TRUE` (default), the plot objects will be
#'   saved as an RData file.
#' @param name A character string for the output file name prefix.
#' @param verbose Logical, if `TRUE` (default), progress messages are printed.
#' @param pdf.dims A numeric vector specifying the width and height of the PDF
#'   output.
#' @param filter.cv.prev Numeric, a cutoff for the cross-validation prevalence
#'   filter.
#' @param nb.top.features Numeric, the number of top features to display.
#' @param scaled.importance Logical, if `TRUE`, scales feature importance
#'   values.
#' @param k_penalty Numeric, a penalty factor applied to control model selection
#'   based on sparsity.
#' @param k_max Numeric, maximum number of features to include.
#'
#' @details This function examines the FBM classifier results, retrieves the
#' best population of models, calculates feature importance, and generates plots
#' to visualize feature prevalence, importance, and effect size. If multiple
#' experiments are provided, results are averaged across experiments.
#'
#' **Generated Plots**:
#'   - **Feature Prevalence (FBM)**: Shows the frequency and orientation of features across models.
#'   - **Feature Importance**: Visualizes feature importance scores.
#'   - **Effect Sizes**: Displays the effect sizes of features.
#'
#' For classification, Cliff's delta is used for effect sizes, and Spearman's
#' rho is used for regression.
#'
#' @return A combined patchwork plot of feature prevalence, importance, effect
#'   sizes, and group prevalence. If `makeplot` is `TRUE`, the plot is also
#'   saved to a PDF file.
#'
#' @examples
#' \dontrun{
#' analyzeImportanceFeaturesFBM(clf_res = my_clf_result, X = X_data, y = y_data, name = "example_analysis")
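#'
#' # For multiple experiments, pass a named list of experiment results
#' # (hypothetical objects clf_res_terga and clf_res_terda):
#' analyzeImportanceFeaturesFBM(clf_res = list(terga = clf_res_terga, terda = clf_res_terda),
#'                              X = X_data, y = y_data, name = "multi_experiment")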
#' }
#'
#' @export
analyzeImportanceFeaturesFBM <- function(clf_res, 
                                         X, 
                                         y, 
                                         makeplot = TRUE, 
                                         saveplotobj = TRUE, 
                                         name = "", 
                                         verbose = TRUE, 
                                         pdf.dims = c(width = 25, height = 20), 
                                         filter.cv.prev = 0.25, 
                                         nb.top.features = 100, 
                                         scaled.importance = FALSE, 
                                         k_penalty = 0.75/100,
                                         k_max = 0)
{
  multiple.experiments <- FALSE
  mode <- NULL
  
  if(!isExperiment(clf_res))
  {
    if(any(!unlist(lapply(clf_res, isExperiment))))
    {
      stop("analyzeImportanceFeaturesFBM: please provide a valid experiment results!")  
    }
    multiple.experiments <- TRUE
    
    if(clf_res[[1]]$classifier$params$objective == "cor")
    {
      mode <- "regression"
    }else
    {
      mode <- "classification"
    }
    
  }else # if not multiple experiments
  {
    if(clf_res$classifier$params$objective == "cor")
    {
      mode <- "regression"
    }else
    {
      mode <- "classification"
    }
  }
  
  if(!is.null(mode)) 
  {
    cat(paste("... Estimating mode: ", mode,"\n"))
  }else
  {
    stop("analyzeImportanceFeaturesFBM: mode not founding stopping ...")
  }
  
  # If this is a single experiment
  if(!multiple.experiments)
  {
    # get the final population
    pop <- modelCollectionToPopulation(clf_res$classifier$models)
    if(verbose) print(paste("There are",length(pop), "models in this population"))
    # select the best population
    pop <- selectBestPopulation(pop = pop, 
                                score = clf_res$classifier$params$evalToFit, 
                                p = 0.05, k_penalty = k_penalty, k_max = k_max)
    if(verbose) print(paste("There are",length(pop), 
                            "models in this population after selection of the best"))
    
    if(length(pop) == 1)
    {
      print("analyzeImportanceFeatures: only one model after filtering. 
            Plot can not be built... returing empty handed.")
      return(NULL)
    }
    
    # get the population information into a dataframe
    pop.df <- populationToDataFrame(pop = pop)
    # get the feature to model dataframe
    pop.noz <- listOfModelsToDenseCoefMatrix(clf = clf_res$classifier, 
                                             X = X, y = y, list.models = pop)
    if(verbose) print(paste("Pop noz object is created with", nrow(pop.noz), 
                            "features and", ncol(pop.noz), "models"))
    
    # make the feature annots
    fa <- makeFeatureAnnot(pop = pop, X = X, y = y, clf = clf_res$classifier)
    pop.noz <- fa$pop.noz
    
    # get the feature importance information if it exists; do it for all features in X (subsequent filtering from FBM)
    lr <- list(clf_res)
    names(lr) <- paste(clf_res$classifier$learner, clf_res$classifier$params$language, sep=".")
    
    feat.import <- mergeMeltImportanceCV(list.results = lr, 
                                         filter.cv.prev = filter.cv.prev, 
                                         min.kfold.nb = FALSE, 
                                         learner.grep.pattern = "*", 
                                         # nb.top.features = nb.top.features, 
                                         nb.top.features = nrow(X),
                                         feature.selection = NULL,
                                         scaled.importance = scaled.importance,
                                         make.plot = TRUE)
    if(is.null(feat.import))
    {
      print("analyzeImportanceFeatures: no feature importance data found... returning empty handed.")
      return(NULL)
    }
    
    # Process the feat.import$summary to wide format (will be used for filtering features to plot)
    feat.import.dcast <- data.table::dcast(data = feat.import$summary, formula = feature~method, value.var="value")
    rownames(feat.import.dcast) <- feat.import.dcast$feature ; feat.import.dcast <- feat.import.dcast[,-1,drop=FALSE]
    feat.import.dcast$meanval <- rowMeans(feat.import.dcast, na.rm = TRUE)
    feat.import.dcast$feature <- factor(rownames(feat.import.dcast), levels = rownames(feat.import.dcast)[order(feat.import.dcast$meanval, decreasing = TRUE)])
    
    # Do the selection of features from feat.import.dcast
    features.import <- feat.import.dcast[rownames(pop.noz),]
    features.import$feature <- droplevels(features.import$feature)
    
    # if the number of features in FBM is higher than the nb.top.features, 
    # select the top nb.top.features in FBM based on feature importance
    if(nrow(features.import) > nb.top.features) 
    {
      print(paste(nrow(features.import), " features in FBM vs ", nb.top.features, 
                  " nb.top.features; select the ", nb.top.features, 
                  " top features with higher feature importance", sep = ""))
      features.import.vec <- levels(features.import$feature)[1:nb.top.features]
    } else # else, keep all features in FBM
    {
      print(paste(nrow(features.import), " features in FBM vs ", nb.top.features, 
                  " nb.top.features; reporting all features in FBM", sep = ""))
      features.import.vec <- levels(features.import$feature)
    }
    
    # ----------------------------------
    # Build the feature prevalence plot
    # ----------------------------------
    g1.df <- data.frame(pop.noz)
    g1.df$feature <- rownames(g1.df) ; g1.df <- melt(g1.df)
    g1.df$learner <- unlist(lapply(strsplit(as.character(g1.df$variable), split="_"), function(x){x[1]}))
    #add the language from clf object
    g1.df$learner <- paste(g1.df$learner, clf_res$classifier$params$language, sep = ".")
    g1.df$model <- as.character(unlist(lapply(strsplit(as.character(g1.df$variable), split="_"), function(x){x[2]})))
    #Subset g1.df to target features
    g1.df <- g1.df[g1.df$feature %in% features.import.vec,]
    g1.df$feature <- factor(g1.df$feature, levels=rev(features.import.vec))
    g1.df$value <- factor(g1.df$value, levels = c(-1,0,1))
    g1.df$value <- droplevels(g1.df$value)
    g1.df.cols <- c("deepskyblue1","white","firebrick1")
    names(g1.df.cols) <- c("-1","0","1")
    g1.df.cols <- g1.df.cols[levels(g1.df$value)]
    g1 <- ggplot(g1.df, aes(x=model, y=feature, fill=value)) + 
      geom_tile(colour=NA) + 
      facet_grid(.~learner, scales = "free_x", space = "free_x") + 
      scale_fill_manual(values = g1.df.cols) + 
      xlab("FBM") + 
      theme_classic() + 
      theme(legend.position = "none",
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank())
    
    
    # ----------------------------------
    # get the importance graphic
    # ----------------------------------
    g6.df <- feat.import$summary
    g6.df <- g6.df[g6.df$feature %in% features.import.vec,]
    g6.df$feature <- factor(g6.df$feature, levels=rev(features.import.vec))
    g6.df$sign <- factor(g6.df$sign, levels=c("-1","0","1"))
    g6.df$sign <- droplevels(g6.df$sign)
    g6.col <- c("deepskyblue1","white","firebrick1")
    names(g6.col) <- c("-1","0","1")
    g6.col <- g6.col[levels(g6.df$sign)]
    g6 <- ggplot(g6.df, aes(x=feature, y=value)) + 
      # the prevalence data on the bottom
      # geom_bar(data = fprev.melt, stat="identity", position="identity", aes(fill = "1")) +
      geom_hline(yintercept = min(0, g6.df$value, na.rm = TRUE), col="gray") +
      # ylab("Feature importance & prevalence (CV)") +
      ylab("Feature importance") +
      xlab("") +
      facet_grid(.~method, scales = "free") + 
      theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
      theme_bw() +
      coord_flip() +
      # scale_fill_manual("Dataset", values = c("gray90","gray90")) +
      scale_color_manual("Dataset", values = g6.col) +
      # the importance data 
      geom_errorbar(aes(ymin = value - se, ymax = value + se, color = sign), width=.1, position=position_dodge(0.3)) +
      #geom_line(aes(group = feature, color = sign), position=pd) +
      geom_point(position = position_dodge(0.3), size=2, shape=19, aes(color = sign)) + # 19 is a solid circle
      guides(colour = "none", fill = "none")
    g6 <- g6 + theme(axis.text.y = element_blank(),
                     strip.background = element_rect(fill = NA))
    
    # ----------------------------------
    # get the prevalence graphic
    # ----------------------------------
    if(mode == "regression")
    {
      g7 <- plotPrevalence(features = features.import.vec, X, y = NULL)
    }else
    {
      g7 <- plotPrevalence(features = features.import.vec, X, y)
    }
    g7 <- g7 + theme(axis.title.y=element_blank(),
                     axis.text.y=element_blank(),
                     axis.ticks.y=element_blank())
    
    
    # ----------------------------------
    # get the effect size graphic
    # ----------------------------------
    g8.df <- computeEffectSizes(X = X, y = y, mode = mode)
    #subset g8 to features.import
    g8.df <- g8.df[g8.df$feature %in% features.import.vec,]
    g8.df$feature <- factor(g8.df$feature, levels=rev(features.import.vec))
    # add FDR adjustment + labels
    if(mode == "classification")
    {
      g8.df$fdr <- stats::p.adjust(g8.df$pval.wilcox, method = "BH")
      g8.df$label <- ifelse(g8.df$fdr<0.05,"**", ifelse(g8.df$pval.wilcox<0.05,"*",""))
      g8 <- ggplot(g8.df, aes(x=feature, y=cdelta, fill=cdelta)) + 
        geom_bar(stat = "identity") + 
        geom_text(aes(label=label)) + 
        scale_fill_gradient(low = "deepskyblue1", high = "firebrick1", limits=c(-1,1)) + 
        ylab("Cliff's delta 1 vs. -1") + 
        ggtitle("") + 
        coord_flip() + 
        theme_bw() + 
        theme(axis.text.y = element_blank(),
              axis.title.y = element_blank(),
              legend.position = "none")
    } else if(mode == "regression")
    {
      g8.df$fdr <- stats::p.adjust(g8.df$pval, method = "BH")
      g8.df$label <- ifelse(g8.df$fdr<0.05,"**", ifelse(g8.df$pval<0.05,"*",""))
      g8 <- ggplot(g8.df, aes(x=feature, y=rho, fill=rho)) + 
        geom_bar(stat = "identity") + 
        geom_text(aes(label=label)) + 
        scale_fill_gradient(low = "deepskyblue1", high = "firebrick1", limits=c(-1,1)) + 
        ylab("Spearman Rho vs. y var") + 
        ggtitle("") + 
        coord_flip() + 
        theme_bw() + 
        theme(axis.text.y = element_blank(),
              axis.title.y = element_blank(),
              legend.position = "none")
    }
    
    ## Build a list with plots to visualize
    plot.list <- list("featprevFBM" = g1,
                      "featImp" = g6,
                      "effectSizes" = g8,
                      "featPrevGroups" = g7)
    # Save plotlist object if specified
    if(saveplotobj)
    {
      save(plot.list, file=paste("population features",name,".Rda", sep=""))
    }
    
    
    # plotting statements
    if(makeplot)
    {
      if(verbose) print(paste("Making plots in a dedicated pdf"))
      # Set the layout for patchwork plotting
      layout <- "
      AAABBCD
      "
      # Set the file name
      fname <- paste("population features",name,".pdf", sep="")
      # Build the patchwork plot
      fname.plot <- patchwork::wrap_plots(plot.list, design = layout)
      # Save the plot with cowplot::ggsave2
      cowplot::ggsave2(filename = fname, 
                       plot = fname.plot, 
                       width = as.numeric(pdf.dims[1]), 
                       height = as.numeric(pdf.dims[2]))
      # Return the plot
      return(patchwork::wrap_plots(plot.list, design = layout))
    }else
    {
      layout <- "
      AAABBCD
      "
      return(patchwork::wrap_plots(plot.list, design = layout))
      # return(plot.list)
    }
  }else # multiple experiments
  {
    lr <- clf_res
    feat.import <- mergeMeltImportanceCV(list.results = lr, 
                                         filter.cv.prev = filter.cv.prev, 
                                         min.kfold.nb = FALSE, 
                                         learner.grep.pattern = "*", 
                                         # nb.top.features = nb.top.features, 
                                         nb.top.features = nrow(X), 
                                         make.plot = TRUE)
    
    if(is.null(feat.import))
    {
      warning("analyzeImportantFeatures: These learners have no feature importance as implemented in the BTR ones. Sending an empty plot.")
      return(ggplot() + theme_void())
    }
    
    # Process the feat.import$summary to wide format (will be used for filtering features to plot)
    feat.import.dcast <- data.table::dcast(data = feat.import$summary, formula = feature~method, value.var="value")
    rownames(feat.import.dcast) <- feat.import.dcast$feature ; feat.import.dcast <- feat.import.dcast[,-1,drop=FALSE]
    feat.import.dcast$meanval <- rowMeans(feat.import.dcast, na.rm = TRUE)
    feat.import.dcast$feature <- factor(rownames(feat.import.dcast), levels=rownames(feat.import.dcast)[order(feat.import.dcast$meanval, decreasing = TRUE)])
    
    # # the most important features along with the order
    # features.import <- rev(levels(feat.import$summary$feature))
    
    ## get the FBM feature prevalence graphic
    #Get the population of best models
    pop.list <- lapply(clf_res, function(x){modelCollectionToPopulation(x[["classifier"]][["models"]])})
    #Select the best populations (FBM)
    pop.list.fbm <- list()
    for(i in names(pop.list))
    {
      pop.list.fbm[[i]] <- selectBestPopulation(pop = pop.list[[i]], 
                                                score = clf_res[[i]][["classifier"]][["params"]][["evalToFit"]],
                                                p = 0.05, 
                                                k_penalty = k_penalty, 
                                                k_max = k_max)
    }
    # get the population information into a dataframe
    pop.list.fbm.df <- lapply(pop.list.fbm, function(x){populationToDataFrame(pop = x)})
    # get the feature to model dataframe
    pop.list.fbm.noz <- list()
    for(i in names(pop.list.fbm))
    {
      pop.list.fbm.noz[[i]] <- listOfModelsToDenseCoefMatrix(clf = clf_res[[i]][["classifier"]], X = X, y = y, list.models = pop.list.fbm[[i]])
    }
    
    # make the feature annots
    pop.list.fbm.fa <- list()
    for(i in names(pop.list.fbm))
    {
      pop.list.fbm.fa[[i]] <- makeFeatureAnnot(pop = pop.list.fbm[[i]], X = X, y = y, clf = clf_res[[i]][["classifier"]])
    }
    
    #merge both pop.nozdfs
    pop.list.fbm.fa.popnoz.df <- list()
    for(i in names(pop.list.fbm.fa))
    {
      idf <- data.frame(pop.list.fbm.fa[[i]][["pop.noz"]])
      idf$feature <- rownames(idf) ; idf <- melt(idf)
      idf$learner <- unlist(lapply(strsplit(as.character(idf$variable), split="_"), function(x){x[1]}))
      idf$model <- as.character(unlist(lapply(strsplit(as.character(idf$variable), split="_"), function(x){x[2]})))
      idf$source <- paste(idf$learner,i, sep = ".")
      pop.list.fbm.fa.popnoz.df[[i]] <- idf
    }
    pop.list.fbm.fa.popnoz.df <- do.call("rbind", pop.list.fbm.fa.popnoz.df)
    pop.list.fbm.fa.popnoz.df$value <- factor(pop.list.fbm.fa.popnoz.df$value, levels=c(-1,0,1))
    pop.list.fbm.fa.popnoz.df$value <- droplevels(pop.list.fbm.fa.popnoz.df$value)
    
    #Do the selection of features from feat.import.dcast
    features.import <- feat.import.dcast[unique(pop.list.fbm.fa.popnoz.df$feature),]
    features.import$feature <- droplevels(features.import$feature)
    if(nrow(features.import)>nb.top.features)
    {
      print(paste(nrow(features.import), " features in FBM from ", length(clf_res), " experiments vs ", nb.top.features, " nb.top.features; select the ", nb.top.features, " top features with higher mean feature importance across experiments", sep = ""))
      features.import.vec <- levels(features.import$feature)[1:nb.top.features]
    } else
    {
      print(paste(nrow(features.import), " features in FBM from ", length(clf_res), "experiments vs ", nb.top.features, " nb.top.features; reporting all features in FBM", sep = ""))
      features.import.vec <- levels(features.import$feature)
    }
    
    # ----------------------------------
    # Build the feature prevalence plot
    # ----------------------------------
    g1.df <- pop.list.fbm.fa.popnoz.df[pop.list.fbm.fa.popnoz.df$feature %in% features.import.vec,]
    g1.df$feature <- factor(g1.df$feature, levels=rev(features.import.vec))
    g1.col <- c("deepskyblue1","white","firebrick1")
    names(g1.col) <- c("-1","0","1")
    g1.col <- g1.col[levels(g1.df$value)]
    g1 <- ggplot(g1.df, aes(x=model, y=feature, fill=value)) + 
      geom_tile(colour=NA) + 
      facet_grid(.~source, scales = "free_x", space = "free_x") + 
      scale_fill_manual(values = g1.col) + 
      xlab("FBM") + 
      theme_classic() + 
      theme(legend.position = "none",
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank())
    
    # ----------------------------------
    # get the importance graphic
    # ----------------------------------
    g6.df <- feat.import$summary
    g6.df <- g6.df[g6.df$feature %in% features.import.vec,]
    g6.df$feature <- factor(g6.df$feature, levels=rev(features.import.vec))
    g6.df$sign <- factor(g6.df$sign, levels=c("-1","0","1"))
    g6.df$sign <- droplevels(g6.df$sign)
    #Add the learner from the clf list
    g6.df.learner <- data.frame(sapply(clf_res, function(x){x[["classifier"]][["learner"]]}))
    colnames(g6.df.learner) <- "learner"
    g6.df <- merge(g6.df, g6.df.learner, by.x="method", by.y=0, all.x=TRUE)
    g6.df$learner <- paste(g6.df$learner, g6.df$method, sep = ".")
    g6.col <- c("deepskyblue1","white","firebrick1")
    names(g6.col) <- c("-1","0","1")
    g6.col <- g6.col[levels(g6.df$sign)]
    g6 <- ggplot(g6.df, aes(x=feature, y=value)) + 
      # the prevalence data on the bottom
      # geom_bar(data = fprev.melt, stat="identity", position="identity", aes(fill = "1")) +
      geom_hline(yintercept = min(0, g6.df$value, na.rm = TRUE), col="gray") +
      # ylab("Feature importance & prevalence (CV)") +
      ylab("Feature importance") +
      xlab("") +
      facet_grid(.~learner, scales = "free") + 
      theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
      theme_bw() +
      coord_flip() +
      # scale_fill_manual("Dataset", values = c("gray90","gray90")) +
      scale_color_manual("Dataset", values = g6.col) +
      # the importance data 
      geom_errorbar(aes(ymin = value - se, ymax = value + se, color = sign), width=.1, position=position_dodge(0.3)) +
      #geom_line(aes(group = feature, color = sign), position=pd) +
      geom_point(position = position_dodge(0.3), size=2, shape=19, aes(color = sign)) + # 19 is a solid circle
      guides(colour = "none", fill = "none")
    g6 <- g6 + theme(axis.text.y = element_blank(), 
                     strip.background = element_rect(fill = NA))
    
    # ----------------------------------
    # get the prevalence graphic
    # ----------------------------------
    if(mode == "regression")
    {
      g7 <- plotPrevalence(features = features.import.vec, X, y = NULL)
    }else
    {
      g7 <- plotPrevalence(features = features.import.vec, X, y)
    }
    g7 <- g7 + theme(axis.title.y=element_blank(),
                     axis.text.y=element_blank(),
                     axis.ticks.y=element_blank())
    
    # ----------------------------------
    # get the effect size graphic
    # ----------------------------------
    g8.df <- computeEffectSizes(X = X, y = y, mode = mode)
    #subset g8 to features.import
    g8.df <- g8.df[g8.df$feature %in% features.import.vec,]
    g8.df$feature <- factor(g8.df$feature, levels=rev(features.import.vec))
    #add FDR adjustment + labels
    if(mode == "classification")
    {
      g8.df$fdr <- stats::p.adjust(g8.df$pval.wilcox, method = "BH")
      g8.df$label <- ifelse(g8.df$fdr<0.05,"**", ifelse(g8.df$pval.wilcox < 0.05,"*",""))
      g8 <- ggplot(g8.df, aes(x=feature, y=cdelta, fill=cdelta)) + 
        geom_bar(stat = "identity") + 
        geom_text(aes(label=label)) + 
        scale_fill_gradient(low = "deepskyblue1", high = "firebrick1", limits = c(-1,1)) + 
        ylab("Cliff's delta 1 vs. -1") + 
        ggtitle("") + 
        coord_flip() + 
        theme_bw() + 
        theme(axis.text.y = element_blank(),
              axis.title.y = element_blank(),
              legend.position = "none")
    } else if(mode == "regression")
    {
      g8.df$fdr <- stats::p.adjust(g8.df$pval, method = "BH")
      g8.df$label <- ifelse(g8.df$fdr<0.05,"**", ifelse(g8.df$pval<0.05,"*",""))
      g8 <- ggplot(g8.df, aes(x=feature, y=rho, fill=rho)) + 
        geom_bar(stat = "identity") + 
        geom_text(aes(label=label)) + 
        scale_fill_gradient(low = "deepskyblue1", high = "firebrick1", limits = c(-1,1)) + 
        ylab("Spearman Rho vs. y var") + 
        ggtitle("") + 
        coord_flip() + 
        theme_bw() + 
        theme(axis.text.y = element_blank(),
              axis.title.y = element_blank(),
              legend.position = "none")
    }
    
    ## Build a list with plots to visualize
    plot.list <- list("featprevFBM"=g1,
                      "featImp"=g6,
                      "effectSizes"=g8,
                      "featPrevGroups"=g7)
    # Save the plot object if specified
    if(saveplotobj)
    {
      save(plot.list, file=paste("population features",name,".Rda", sep=""))
    }
    #Plotting
    if(makeplot)
    {
      if(verbose) print(paste("Making plots in a dedicated pdf"))
      #Set the layout for patchwork plotting
      layout <- "
      AAABBCD
      "
      #Set the file name
      fname <- paste("population features",name,".pdf", sep="")
      #Build the patchwork plot
      fname.plot <- patchwork::wrap_plots(plot.list, design = layout)
      #Save the plot with cowplot::ggsave2
      cowplot::ggsave2(filename = fname, 
                       plot = fname.plot, width = as.numeric(pdf.dims[1]), height = as.numeric(pdf.dims[2]))
      #Return the plot
      return(patchwork::wrap_plots(plot.list, design = layout))
    } else
    {
      layout <- "
      AAABBCD
      "
      return(patchwork::wrap_plots(plot.list, design = layout))
    }
  } # end multiple experiments
}


#' Compute Effect Sizes for Features
#'
#' This function computes effect sizes for each feature in the feature matrix
#' `X`, using different methods based on the specified mode (`classification` or
#' `regression`). For classification tasks, it calculates Cliff's delta and
#' performs a Wilcoxon test for each feature, assuming binary classes `1` and
#' `-1`. For regression tasks, it computes Spearman's rank correlation
#' coefficient.
#'
#' @param X A feature matrix with rows representing features and columns
#'   representing samples.
#' @param y The response variable, either a binary factor for classification
#'   (with levels `1` and `-1`) or a continuous variable for regression.
#' @param mode A character string indicating the task type, either
#'   `"classification"` or `"regression"`.
#'
#' @return A data frame with effect size metrics for each feature:
#'   - For `classification` mode: columns `feature`, `cdelta` (Cliff's delta), and `pval.wilcox` (p-value from Wilcoxon test).
#'   - For `regression` mode: columns `feature`, `rho` (Spearman's correlation coefficient), and `pval` (p-value).
#'
#' @details
#' **Classification Mode**:
#'   - Checks that `y` contains only binary values (`1` and `-1`).
#'   - Calculates Cliff's delta to measure effect size and performs a Wilcoxon rank-sum test to assess the significance of feature values between the two classes.
#'
#' **Regression Mode**:
#'   - Computes Spearman's rank correlation coefficient (`rho`) to assess monotonic relationships between each feature and the continuous response variable `y`.
#'
#' @examples
#' \dontrun{
#' # For classification
#' effect_sizes <- computeEffectSizes(X = my_data, y = my_labels, mode = "classification")
#'
#' # For regression
#' effect_sizes <- computeEffectSizes(X = my_data, y = my_continuous_response, mode = "regression")
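#'
#' # Minimal synthetic sketch (toy data assumed purely for illustration):
#' set.seed(42)
#' X_toy <- matrix(rnorm(40), nrow = 4,
#'                 dimnames = list(paste0("feat_", 1:4), paste0("sample_", 1:10)))
#' y_toy <- rep(c(1, -1), each = 5)
#' computeEffectSizes(X = X_toy, y = y_toy, mode = "classification")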
#' }
#'
#' @importFrom effsize cliff.delta
#' @importFrom stats wilcox.test cor.test
#'
#' @export
computeEffectSizes <- function(X, y, mode)
{
  if(mode == "classification")
  {
    #Transform y to factor
    y.factor <- factor(y, levels=c(1,-1))
    ##Check that y is binary (1, -1)
    if(length(which(is.na(y.factor)))>0)
    {
      stop("y vector contains other levels than 1,-1; please check if classification task")
    }
    #Build vector of sample classes
    names(y.factor) <- colnames(X)
    y.factor <- data.frame(y.factor)
    #melt the X
    X.melt <- as.data.frame(X)
    X.melt$feature <- rownames(X) ; X.melt <- melt(X.melt)
    X.melt <- merge(X.melt, y.factor, by.x="variable", by.y=0, all.x=TRUE)
    # Do the univariate tests + cliff delta calculations
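    # Cliff's delta = P(x|class 1 > x|class -1) - P(x|class 1 < x|class -1),
    # ranging from -1 (feature higher in class -1) to +1 (feature higher in class 1)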
    X.melt.tests <- list()
    for(v in unique(X.melt$feature))
    {
      v.cdelta <- effsize::cliff.delta(X.melt$value[X.melt$feature %in% v]~X.melt$y.factor[X.melt$feature %in% v])
      v.wilcox <- stats::wilcox.test(X.melt$value[X.melt$feature %in% v]~X.melt$y.factor[X.melt$feature %in% v])
      X.melt.tests[[v]] <- data.frame(feature=v,
                                      cdelta=v.cdelta$estimate,
                                      pval.wilcox=v.wilcox$p.value)
    }
    X.melt.tests <- do.call("rbind", X.melt.tests)
    rownames(X.melt.tests) <- seq(1,nrow(X.melt.tests))
    return(X.melt.tests)
  } else if(mode == "regression")
  {
    y.cont <- y
    names(y.cont) <- colnames(X)
    y.cont <- data.frame(y.cont)
    #melt the X
    X.melt <- as.data.frame(X)
    X.melt$feature <- rownames(X) ; X.melt <- melt(X.melt)
    X.melt <- merge(X.melt, y.cont, by.x="variable", by.y=0, all.x=TRUE)
    # Do the univariate Spearman correlation tests
    X.melt.tests <- list()
    for(v in unique(X.melt$feature))
    {
      v.spearman <- stats::cor.test(X.melt$value[X.melt$feature %in% v], X.melt$y.cont[X.melt$feature %in% v], method = "spearman")
      X.melt.tests[[v]] <- data.frame(feature=v,
                                      rho=v.spearman$estimate,
                                      pval=v.spearman$p.value)
    }
    X.melt.tests <- do.call("rbind", X.melt.tests)
    rownames(X.melt.tests) <- seq(1,nrow(X.melt.tests))
    return(X.melt.tests)
  } else
  {
    stop("mode not regression nor classification")
  }
}

#' Extract Important Features and Related Metrics from Final Population
#'
#' This function processes the final population of models from a given
#' classifier experiment (`clf_res`), selects the best population based on
#' specified criteria, and computes the feature importance, prevalence, and
#' effect sizes. The function returns a list of objects that can be used for
#' plotting or further analysis of the most relevant features in the model.
#'
#' @param clf_res A classifier experiment result, as produced by the modeling
#'   function.
#' @param X A feature matrix with rows representing features and columns
#'   representing samples.
#' @param y A response variable, either a binary factor (for classification) or
#'   a continuous variable (for regression).
#' @param verbose Logical. If `TRUE`, print detailed messages.
#' @param filter.cv.prev Numeric threshold for filtering based on
#'   cross-validation prevalence (default is 0.25).
#' @param scaled.importance Logical. If `TRUE`, scales the feature importance
#'   scores.
#' @param k_penalty A numeric penalty factor applied to sparsity selection
#'   during model evaluation (default is `0.75/100`).
#' @param k_max Maximum allowed sparsity value during model selection (default
#'   is 0).
#'
#' @return A list with the following components:
#'   - `featprevFBM`: A data frame containing feature prevalence data.
#'   - `featImp`: A summary of feature importance across cross-validation folds.
#'   - `effectSizes`: A data frame with effect sizes for each feature (Cliff’s delta for classification or Spearman's rho for regression).
#'   - `featPrevGroups`: Data used for plotting feature prevalence by group.
#'
#' @details
#' **Workflow**:
#'   - Determines if the experiment is regression or classification based on the classifier's objective.
#'   - Filters the best models from the population based on sparsity and evaluation criteria.
#'   - Constructs data structures that capture the feature importance, prevalence, and effect sizes.
#'   - Returns a list of data objects for easy plotting or further analysis.
#'
#' **Requirements**:
#'   - `isExperiment` should be a function that checks if `clf_res` is a valid experiment.
#'   - `modelCollectionToPopulation`, `selectBestPopulation`, and other helper functions should be defined for processing population and feature data.
#'
#' @examples
#' \dontrun{
#' # Extract feature importance and related metrics
#' feature_data <- getImportanceFeaturesFBMobjects(clf_res = my_experiment, X = my_data, y = my_labels)
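#'
#' # Results from several datasets can be collected into a named list
#' # (hypothetical second experiment) and passed to plotImportanceFeaturesFBMobjects():
#' FBMobjList <- list(datasetA = feature_data,
#'                    datasetB = getImportanceFeaturesFBMobjects(clf_res = my_experiment2,
#'                                                               X = my_data2, y = my_labels2))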
#' }
#'
#' @export
getImportanceFeaturesFBMobjects <- function(clf_res, 
                                            X, 
                                            y, 
                                            verbose = TRUE, 
                                            filter.cv.prev = 0.25, 
                                            scaled.importance = FALSE, 
                                            k_penalty = 0.75/100,
                                            k_max = 0)
{
  mode <- NULL
  if(!isExperiment(clf_res))
  {
    stop("analyzeLearningFeatures: please provide a valid experiment results!")  
  }
  if(clf_res$classifier$params$objective == "cor")
  {
    mode <- "regression"
  }else
  {
    mode <- "classification"
  }
  
  if(!is.null(mode)) 
  {
    cat(paste("... Estimating mode: ", mode,"\n"))
  }else
  {
    stop("analyzeImportanceFeatures: mode not founding stopping ...")
  }
  #############
  # get the final population
  #############
  pop <- modelCollectionToPopulation(clf_res$classifier$models)
  if(verbose) print(paste("There are",length(pop), "models in this population"))
  # select the best population
  pop <- selectBestPopulation(pop = pop, score = clf_res$classifier$params$evalToFit, p = 0.05, k_penalty = k_penalty, k_max = k_max)
  if(verbose) print(paste("There are",length(pop), "models in this population after selection of the best"))
  
  if(length(pop) == 1)
  {
    stop("analyzeImportanceFeatures: only one model after filtering. Plot can not be built... returing empty handed.")
  }
  
  # get the population information into a dataframe
  pop.df <- populationToDataFrame(pop = pop)
  # get the feature to model dataframe
  pop.noz <- listOfModelsToDenseCoefMatrix(clf = clf_res$classifier, X = X, y = y, list.models = pop)
  if(verbose) print(paste("Pop noz object is created with", nrow(pop.noz), "features and", ncol(pop.noz), "models"))
  # make the feature annots + get the data in plotting format
  fa <- makeFeatureAnnot(pop = pop, X = X, y = y, clf = clf_res$classifier)
  pop.noz <- fa$pop.noz
  pop.noz <- data.frame(pop.noz)
  pop.noz$feature <- rownames(pop.noz) ; pop.noz <- melt(pop.noz)
  pop.noz$learner <- unlist(lapply(strsplit(as.character(pop.noz$variable), split="_"), function(x){x[1]}))
  #add the language from clf object
  pop.noz$learner <- paste(pop.noz$learner, clf_res$classifier$params$language, sep = ".")
  pop.noz$model <- as.character(unlist(lapply(strsplit(as.character(pop.noz$variable), split="_"), function(x){x[2]})))
  pop.noz$value <- factor(pop.noz$value, levels = c(-1,0,1))
  pop.noz$value <- droplevels(pop.noz$value)
  
  ########
  # get the feature importance information if it exists; do it for all features in X (subsequent filtering from FBM)
  ########
  lr <- list(clf_res)
  names(lr) <- paste(clf_res$classifier$learner, clf_res$classifier$params$language, sep=".")
  feat.import <- mergeMeltImportanceCV(list.results = lr, 
                                       filter.cv.prev = filter.cv.prev, 
                                       min.kfold.nb = FALSE, 
                                       learner.grep.pattern = "*", 
                                       # nb.top.features = nb.top.features, 
                                       nb.top.features = nrow(X),
                                       feature.selection = NULL,
                                       scaled.importance = scaled.importance,
                                       make.plot = TRUE)
  if(is.null(feat.import))
  {
    stop("analyzeImportanceFeatures: no feature importance data found... returning empty handed.")
  }
  
  # get the prevalence graphic object
  if(mode == "regression")
  {
    featPrevPlot <- plotPrevalence(features = rownames(X), X, y = NULL)
  }else
  {
    featPrevPlot <- plotPrevalence(features = rownames(X), X, y)
  }
  # get the effect size graphic
  effSizes.df <- computeEffectSizes(X = X, y = y, mode = mode)
  
  ## Build a list with objects to save
  outlist <- list("featprevFBM"=pop.noz,
                  "featImp"=feat.import$summary,
                  "effectSizes"=effSizes.df,
                  "featPrevGroups"=featPrevPlot$data)
  ## Add specific class label to outlist for subsequent checking in plotImportanceFeaturesFBMobjects
  class(outlist) <- "listFBMfeatures"
  return(outlist)
}


#' Plot Feature Importance, Prevalence, and Effect Sizes from Final Models
#'
#' This function takes a list of `listFBMfeatures` objects (produced by the
#' `getImportanceFeaturesFBMobjects` function), and generates a series of plots
#' visualizing the feature importance, prevalence across models, effect sizes,
#' and feature prevalence across groups. The plots are generated for the top
#' features, based on their importance, and saved in a PDF file.
#'
#' @param FBMobjList A list of `listFBMfeatures` objects, typically produced by
#'   `getImportanceFeaturesFBMobjects`.
#' @param verbose Logical. If `TRUE`, print detailed messages.
#' @param nb.top.features Integer. The number of top features to display in the
#'   plots. Default is 100.
#' @param makeplot Logical. If `TRUE`, generate the plots. If `FALSE`, return
#'   the plot objects without saving.
#' @param pdf.dims A vector of two numbers specifying the width and height of
#'   the PDF (default is `c(width = 25, height = 20)`).
#' @param name A character string appended to the output PDF file name.
#'
#' @return A combined patchwork plot of the four panels (feature prevalence in
#'   the FBM, feature importance, effect sizes, and group prevalence). If
#'   `makeplot` is `TRUE`, the plot is also saved to a PDF file.
#'
#' @details
#' **Workflow**:
#'   - Extracts feature importance, prevalence, and effect size data from the given `FBMobjList`.
#'   - Selects the top features based on their importance across multiple datasets.
#'   - Generates four key visualizations:
#'     1. Feature prevalence in the final population of models.
#'     2. Feature importance across models.
#'     3. Effect sizes (Cliff's delta for classification or Spearman's rho for regression).
#'     4. Feature prevalence across groups (e.g., -1, 1 classes for classification tasks).
#'   - If `makeplot` is `TRUE`, the plots are saved to a PDF file.
#'
#' **Requirements**:
#'   - The input `FBMobjList` must be a list of `listFBMfeatures` objects, as produced by `getImportanceFeaturesFBMobjects`.
#'   - Helper functions for plotting such as `plotPrevalence` and `computeEffectSizes` must be defined.
#'
#' @examples
#' \dontrun{
#' # Plot feature importance and related metrics
#' plotImportanceFeaturesFBMobjects(FBMobjList = my_FBM_objects, makeplot = TRUE)
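#'
#' # Return the combined patchwork object without writing a PDF; list names
#' # (e.g., "cohortA", "cohortB") are used as dataset labels in the facets
#' p <- plotImportanceFeaturesFBMobjects(FBMobjList = my_FBM_objects,
#'                                       nb.top.features = 50, makeplot = FALSE)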
#' }
#'
#' @export
plotImportanceFeaturesFBMobjects <- function(FBMobjList, 
                                             verbose = TRUE, 
                                             nb.top.features = 100,
                                             makeplot = TRUE,
                                             pdf.dims = c(width = 25, height = 20),
                                             name = "")
{
  # check that all elements in FBMobjList are of class listFBMfeatures
  FBMobjList.check <- sapply(FBMobjList, function(x){inherits(x, "listFBMfeatures")})
  if(sum(FBMobjList.check) != length(FBMobjList))
  {
    FBMobjList.check.wrong <- names(FBMobjList.check)[!FBMobjList.check]
    stop(paste(paste(FBMobjList.check.wrong, collapse = ","), 
               "elements in FBMobjList were not produced by getImportanceFeaturesFBMobjects (not of class listFBMfeatures)")) 
  }
  #get the feature importance data + add dataset source
  FBMobjList.fimp <- lapply(FBMobjList, function(x){x[["featImp"]]})
  for(i in names(FBMobjList.fimp))
  {
    FBMobjList.fimp[[i]][,"dataset"] <- i
  }
  #Get the FBM dataframes + the feature importances of the corresponding features
  FBMobjList.fbm <- lapply(FBMobjList, function(x){x[["featprevFBM"]]})
  for(i in names(FBMobjList.fbm))
  {
    FBMobjList.fbm[[i]][,"dataset"] <- i
  }
  #get the unique features in FBMobjList.fbm
  FBMobjList.fbm_features <- unique(unlist(lapply(FBMobjList.fbm, function(x){x[,"feature"]})))
  
  #get the features to plot
  FBMobjList.fimp.df <- do.call("rbind", FBMobjList.fimp)
  FBMobjList.fimp.df.dcast <- data.table::dcast(data = FBMobjList.fimp.df, formula = feature~dataset, value.var="value")
  rownames(FBMobjList.fimp.df.dcast) <- FBMobjList.fimp.df.dcast[,1] ; FBMobjList.fimp.df.dcast <- FBMobjList.fimp.df.dcast[,-1,drop=FALSE]
  #compute the average of feat importance across datasets
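  # (rowSums/ncol treats a missing importance value in a dataset as a zero contribution)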
  FBMobjList.fimp.df.dcast$avg <- rowSums(FBMobjList.fimp.df.dcast, na.rm = TRUE)/ncol(FBMobjList.fimp.df.dcast)
  tfeats <- rownames(FBMobjList.fimp.df.dcast)
  tfeats.fbm <- intersect(tfeats, FBMobjList.fbm_features)
  
  if(length(tfeats.fbm)>nb.top.features)
  {
    if(verbose)
    {
      print(paste(length(tfeats.fbm), " unique features in FBM of combined datasets with feature importance vs. ",nb.top.features, " nb.top.features; selecting ", nb.top.features, " features with highest mean feature importance across datasets"))
    }
    tfeats.fbm <- FBMobjList.fimp.df.dcast[tfeats.fbm,]
    tfeats.fbm <- rownames(tfeats.fbm)[order(tfeats.fbm$avg, decreasing = TRUE)]
    tfeats.fbm <- tfeats.fbm[1:nb.top.features]
  } else
  {
    if(verbose)
    {
      print(paste(length(tfeats.fbm), " unique features in FBM of combined datasets with fefature importance vs. ",nb.top.features, " nb.top.features; keeping all features in FBM of combined datasets"))
    }
    tfeats.fbm <- FBMobjList.fimp.df.dcast[tfeats.fbm,]
    tfeats.fbm <- rownames(tfeats.fbm)[order(tfeats.fbm$avg, decreasing = TRUE)]
  }
  ### Visu prevalence in FBM
  FBMobjList.fbm_plotdf <- lapply(FBMobjList.fbm, function(x){x[x[,"feature"] %in% tfeats.fbm,]})
  FBMobjList.fbm_plotdf <- do.call("rbind", FBMobjList.fbm_plotdf)
  FBMobjList.fbm_plotdf$feature <- factor(FBMobjList.fbm_plotdf$feature, levels = rev(tfeats.fbm))
  plot1.cols <- c("deepskyblue1","white","firebrick1")
  names(plot1.cols) <- c("-1","0","1")
  plot1.cols <- plot1.cols[levels(FBMobjList.fbm_plotdf$value)]
  plot1 <- ggplot(FBMobjList.fbm_plotdf, aes(x=model, y=feature, fill=value)) + 
    geom_tile(colour=NA) + 
    facet_grid(.~dataset, scales = "free_x", space = "free_x") + 
    scale_fill_manual(values = plot1.cols) + 
    xlab("FBM") + 
    theme_classic() + 
    theme(legend.position = "none",
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank())
  ### Visu the feature importance
  FBMobjList.fimp_plot <- lapply(FBMobjList.fimp, function(x){x[x[,"feature"] %in% tfeats.fbm,]})
  FBMobjList.fimp_plot <- do.call("rbind", FBMobjList.fimp_plot)
  FBMobjList.fimp_plot$feature <- factor(FBMobjList.fimp_plot$feature, levels=rev(tfeats.fbm))
  FBMobjList.fimp_plot$sign <- factor(FBMobjList.fimp_plot$sign, levels=c("-1","1"))
  plot2.col <- c("deepskyblue1","white","firebrick1")
  names(plot2.col) <- c("-1","0","1")
  plot2.col <- plot2.col[levels(FBMobjList.fimp_plot$sign)]
  plot2 <- ggplot(FBMobjList.fimp_plot, aes(x=feature, y=value)) + 
    # the prevalence data on the bottom
    # geom_bar(data = fprev.melt, stat="identity", position="identity", aes(fill = "1")) +
    geom_hline(yintercept = min(0, FBMobjList.fimp_plot$value, na.rm = TRUE), col="gray") +
    # ylab("Feature importance & prevalence (CV)") +
    ylab("Feature importance") +
    xlab("") +
    facet_grid(.~dataset) + 
    theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
    theme_bw() +
    coord_flip() +
    # scale_fill_manual("Dataset", values = c("gray90","gray90")) +
    scale_color_manual("Dataset", values = plot2.col) +
    # the importance data 
    geom_errorbar(aes(ymin = value - se, ymax = value + se, color = sign), width=.1, position=position_dodge(0.3)) +
    #geom_line(aes(group = feature, color = sign), position=pd) +
    geom_point(position = position_dodge(0.3), size=2, shape=19, aes(color = sign)) + # 19 is a solid circle
    guides(colour = "none", fill = "none")
  plot2 <- plot2 + theme(axis.text.y = element_blank(),
                         strip.background = element_rect(fill = NA))
  
  ### Visu the feature effectsizes
  FBMobjList.effsize <- lapply(FBMobjList, function(x){x[["effectSizes"]]})
  #subset to tfeats
  FBMobjList.effsize <- lapply(FBMobjList.effsize, function(x){x[x[,"feature"] %in% tfeats.fbm,]})
  #fdr adjustment of pvalues
  for(i in names(FBMobjList.effsize))
  {
    idf <- FBMobjList.effsize[[i]]
    idf[,"fdr"] <- p.adjust(idf[,3], method = "BH")
    idf$dataset <- i
    FBMobjList.effsize[[i]] <- idf
  }
  FBMobjList.effsize <- do.call("rbind", FBMobjList.effsize)
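  # "**" marks FDR < 0.05 and "*" raw p < 0.05; column 3 holds the p-value and
  # column 4 the FDR for both the classification and regression tables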
  FBMobjList.effsize$label <- ifelse(FBMobjList.effsize[,4]<0.05,"**", ifelse(FBMobjList.effsize[,3]<0.05,"*",""))
  FBMobjList.effsize$feature <- factor(FBMobjList.effsize$feature, levels=rev(tfeats.fbm))
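  # NOTE: the bar plot below assumes classification effect sizes (a `cdelta` column);
  # regression tables from computeEffectSizes() carry `rho`/`pval` instead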
  plot3 <- ggplot(FBMobjList.effsize, aes(x=feature, y=cdelta, fill=cdelta)) + 
    geom_bar(stat = "identity") + 
    geom_text(aes(label=label)) + 
    scale_fill_gradient(low = "deepskyblue1", high = "firebrick1", limits=c(-1,1)) + 
    ylab("Cliff's delta 1 vs. -1") + 
    ggtitle("") + 
    facet_grid(.~dataset) + 
    coord_flip() + 
    theme_bw() + 
    theme(axis.text.y = element_blank(),
          axis.title.y = element_blank(),
          strip.background = element_rect(fill = NA),
          legend.position = "none")
  #Visu the feature prevalence across groups
  FBMobjList.fprev <- lapply(FBMobjList, function(x){x[["featPrevGroups"]]})
  FBMobjList.fprev <- lapply(FBMobjList.fprev, function(x){x[x[,"feature"] %in% tfeats.fbm,]})
  for(i in names(FBMobjList.fprev))
  {
    FBMobjList.fprev[[i]][,"dataset"] <- i
  }
  FBMobjList.fprev <- do.call("rbind", FBMobjList.fprev)
  FBMobjList.fprev$feature <- factor(FBMobjList.fprev$feature, levels = rev(tfeats.fbm))
  col.pt = c("deepskyblue4", "firebrick4") 
  col.bg = c("deepskyblue1", "firebrick1")
  plot4 <- ggplot(FBMobjList.fprev, aes(x=feature, y=prevalence, fill=group)) + 
    geom_bar(data=subset(FBMobjList.fprev, group %in% c("all")), stat="identity", position="identity") + 
    facet_grid(.~dataset) + 
    coord_flip() + 
    geom_point(data = subset(FBMobjList.fprev, group %in% c("-1", "1")), aes(x=feature, y=prevalence, color=group, shape=group)) + 
    scale_color_manual("Dataset", values = c("all"="gray90", "-1"=col.pt[1], "1"=col.pt[2])) +
    scale_fill_manual("Dataset", values = c("all"="gray90", "-1"=col.bg[1], "1"=col.bg[2])) +
    scale_shape_manual(values=c(25,24)) + 
    theme_bw() + 
    theme(legend.position="none", axis.text=element_text(size=9),
          axis.text.y = element_blank(),
          axis.title.y = element_blank(),
          strip.background = element_rect(fill = NA)) + 
    ggtitle("") 
  ## Build a list with plots to visualize
  plot.list <- list("featprevFBM"=plot1,
                    "featImp"=plot2,
                    "effectSizes"=plot3,
                    "featPrevGroups"=plot4)
  if(makeplot)
  {
    if(verbose) print(paste("Making plots in a dedicated pdf"))
    #Set the layout for patchwork plotting
    layout <- "
      AAABBCD
      "
    #Set the file name
    fname <- paste("population features multipleExperiments",name,".pdf", sep="")
    #Build the patchwork plot
    fname.plot <- patchwork::wrap_plots(plot.list, design = layout)
    #Save the plot with cowplot::ggsave2
    cowplot::ggsave2(filename = fname, 
                     plot = fname.plot, width = as.numeric(pdf.dims[1]), height = as.numeric(pdf.dims[2]))
    #Return the plot
    return(patchwork::wrap_plots(plot.list, design = layout))
  }else
  {
    layout <- "
      AAABBCD
      "
    return(patchwork::wrap_plots(plot.list, design = layout))
  }
}