R/workflowImportanceHP.R

Defines functions workflowImportanceHP

Documented in workflowImportanceHP

#' Computes the contribution to dissimilarity of each variable using workflowPsiHP.
#'
#' @description This workflow executes the following steps:
#' \itemize{
#' \item computes \code{psi} as done by \code{\link{workflowPsi}}.
#' \item computes \code{psi} as many times as numeric variables in \code{sequences}, removing one of them each time (jacknife analysis) to compute the relative contribution of each variable to overall dissimilarity.
#' \item Delivers an output of type "list" with two slots:
#' \itemize{
#' \item \code{psi} a dataframe with the columns "A" and "B" with the respective names of the sequences compared, a column named "All variables" with the psi values of each pair of sequences computed by considering all variables, and then one column per variable, indicating the \code{psi} value when that variable is removed.
#' \item \code{psi.drop} a dataframe with the columns "A" and "B", and then one column per numeric variable in \code{sequences} indicating the percentage of drop in \code{psi} (as indicated by the "All variables" column in the psi dataframe) when the given variable is removed. Positive values indicate that the given variable reduces dissimilarity when removed, making the sequences more similar, while negative values indicate that the variable increases dissimilarity when removed, making the sequences more different.
#' }
#' }
#'
#' @details If we consider the question "what variable contributes the most to the dissimilarity between two sequences?" the answer "the one dropping dissimilarity the most when excluded from the analysis" sounds like a reasonable answer. This workflow attempts to reach that answer by computing \code{psi} while removing one variable at a time.
#'
#' @usage workflowImportanceHP(
#'   sequences = NULL,
#'   grouping.column = NULL,
#'   time.column = NULL,
#'   exclude.columns = NULL,
#'   parallel.execution = TRUE
#'   )
#'
#' @param sequences dataframe with multiple sequences identified by a grouping column generated by \code{\link{prepareSequences}}.
#' @param grouping.column character string, name of the column in \code{sequences} to be used to identify separates sequences within the file.
#' @param time.column character string, name of the column with time/depth/rank data.
#' @param exclude.columns character string or character vector with column names in \code{sequences} to be excluded from the analysis.
#' @param parallel.execution boolean, if \code{TRUE} (default), execution is parallelized, and serialized if \code{FALSE}.
#'
#' @return A list with two slots named \emph{psi} and \emph{psi.drop}. The former contains the dissimilarity values when removing each variable, while the latter contains the drop in dissimilarity (as a percentage of psi computed on all variables) that happens when each variable is removed. Positive values indicate that dissimilarity drops when the variable is removed, while negative values indicate that similarity drops when the variable is removed.
#'
#' @author Blas Benito <blasbenito@gmail.com>
#' @export
workflowImportanceHP <- function(
  sequences = NULL,
  grouping.column = NULL,
  time.column = NULL,
  exclude.columns = NULL,
  parallel.execution = TRUE
){

  #1. computing psi normally for all sequences
  #to generate the first column of the output dataframe
  ####################################################
    psi.df <- workflowPsiHP(
      sequences = sequences,
      grouping.column = grouping.column,
      time.column = time.column,
      exclude.columns = exclude.columns,
      parallel.execution = parallel.execution
  )
  names(psi.df)[3] <- "All variables"

  #2 computing psi for each column
  #########################################################
  #selecting target columns
  sequence.columns <- colnames(sequences)[!(colnames(sequences) %in% c(time.column, grouping.column, exclude.columns))]
  if(length(sequence.columns) < 2){stop("Only one column is available, a variable importance analysis is not possible.")}

  #generating column combinations
  remove.columns <- utils::combn(sequence.columns, m=length(sequence.columns) - 1)

  #getting the selected column for each iteration
  target.columns <- vector()
  for(i in 1:ncol(remove.columns)){
    target.columns[i] <- sequence.columns[!(sequence.columns %in% remove.columns[,i])]
  }

  #stop if number of elements is different
  if(length(target.columns) != ncol(remove.columns)){
    stop("There is something wrong with the columns selection")
    } else {
    n.iterations <- ncol(remove.columns)
  }

  #parallel execution = TRUE
  if(parallel.execution == TRUE){
    `%dopar%` <- foreach::`%dopar%`
    n.cores <- parallel::detectCores() - 1
    if(n.iterations < n.cores){n.cores <- n.iterations}

    if(.Platform$OS.type == "windows"){
      my.cluster <- parallel::makeCluster(n.cores, type="PSOCK")
    } else {
      my.cluster <- parallel::makeCluster(n.cores, type="FORK")
    }

    doParallel::registerDoParallel(my.cluster)

  #exporting cluster variables
  parallel::clusterExport(cl=my.cluster,
                          varlist=c('sequences',
                                    'target.columns',
                                    'exclude.columns',
                                    'remove.columns',
                                    'workflowPsiHP',
                                    'n.iterations',
                                    'psi.df'),
                          envir=environment()
  )
  } else {
    #replaces dopar (parallel) by do (serial)
    `%dopar%` <- foreach::`%do%`
    on.exit(`%dopar%` <- foreach::`%dopar%`)
  }


  #2: computing psi without the given column
  ##############################################################
  psi.without <- foreach::foreach(i = 1:n.iterations) %dopar% {

      #psi for non paired samples
      psi.i <- workflowPsiHP(
        sequences = sequences,
        grouping.column = grouping.column,
        time.column = time.column,
        exclude.columns = c(target.columns[i], exclude.columns),
        parallel.execution = FALSE
      )

    return(psi.i)

  } #end of parallelized loop

  #names of the new columns
  target.columns.names.without <- paste("Without", target.columns, sep=" ")
  for(i in 1:n.iterations){
    psi.df[,target.columns.names.without[i]] <- psi.without[[i]]$psi
  }

  #stopping cluster
  if(parallel.execution == TRUE){
    parallel::stopCluster(my.cluster)
  } else {
    #creating the correct alias again
    `%dopar%` <- foreach::`%dopar%`
  }

  #3: preparing output
  ######################################
  #drop in dissimilarity when removing a given variable as percentage
  psi.drop <- ((psi.df[, "All variables"] - psi.df[, target.columns.names.without]) * 100) / psi.df[, "All variables"]
  colnames(psi.drop) <- target.columns
  psi.drop <- round(psi.drop, 2)
  psi.drop <- cbind(psi.df[, 1:2], psi.drop)

  #preparing output list
  output.list <- list()
  output.list$psi <- psi.df
  output.list$psi.drop <- psi.drop

  #returning the list
  return(output.list)

} #end of workflow

Try the distantia package in your browser

Any scripts or data that you put into this service are public.

distantia documentation built on Oct. 30, 2019, 10:05 a.m.