Rnalytica: An R package of JIRA defect datasets and tool suites for explainable software analytics

Documented in get.automated.spearman

#' Automated Spearman rank correlation test
#'
#' This function automatically select non-correlated metrics based on a Spearman rank correlation test. To do so, we start from the pair of the strongest correlated metrics. Since these two metrics can be linearly predicted with each other, one of these two metrics must be removed while selecting the other. We select the metric that has the lowest Spearman correlation coefficient with the other metrics that are not in the pair. We repeat this process until all metrics have their Spearman correlation coefficient below a threshold value (default = 0.7).
#'
#' @param dataset  a data frame for data
#' @param metrics  a characters or a vector of characters for independent variables
#' @param spearman.threshold a numeric for a threshold of Spearman rank correlation test (default = 0.7)
#' @param verbose  TRUE for printing
#' @importFrom Hmisc rcorr
#' @keywords Spearman


get.automated.spearman <- function(dataset, metrics, spearman.threshold, verbose = F){

  .get.higher.correlation <- function(index1, index2, metrics, metric.correlations, verbose = F, count){
    metric1 <- metrics[index1]
    metric2 <- metrics[index2]
    if(verbose)
      cat(paste0('Step ', count, ' - {', metric1, ', ', metric2, '} > '))# are correlated with r = ', metric.correlations[metric1, metric2], '\n'))
    metric1.correlation <- mean(metric.correlations[metric1, !metrics %in% c(metric1, metric2)])
    metric2.correlation <- mean(metric.correlations[metric2, !metrics %in% c(metric1, metric2)])
    if(metric1.correlation <= metric2.correlation){
      return(index2)
    } else {
      return(index1)
    }
  }

  metric.correlations <- abs(rcorr(as.matrix(dataset[, metrics]), type = 'spearman')$r)

  above.threshold <- which((metric.correlations >= spearman.threshold), arr.ind = TRUE)
  row.names(above.threshold) <- NULL
  above.threshold <- as.data.frame(above.threshold, row.names = NULL)
  above.threshold <- above.threshold[above.threshold$row != above.threshold$col, ]
  above.threshold$correlation <- 100
  for(i in 1:nrow(above.threshold)){
    above.threshold$correlation[i] <- metric.correlations[above.threshold$row[i], above.threshold$col[i]]
  }
  above.threshold <- above.threshold[order(-above.threshold$correlation), ]

  exclude.metrics <- {}
  count <- 1
  repeat{
    if(nrow(above.threshold) == 0)
      break
    tmp <- above.threshold[1, ]
    exclude.index <- .get.higher.correlation(tmp$row, tmp$col, metrics, metric.correlations, verbose, count)
    exclude.metrics <- c(exclude.metrics, metrics[exclude.index])
    if(verbose){
      cat(paste0(metrics[exclude.index], '\n'))
      count <- count + 1
    }
    above.threshold <- above.threshold[-which((above.threshold$row == exclude.index) | (above.threshold$col == exclude.index)), ]
  }
  selected.metrics <- metrics[!metrics %in% exclude.metrics]
  return(selected.metrics)
}

get.automated.spearman_Group <- function (dataset, metrics, spearman.threshold, verbose = F) 
{
  .get.higher.correlation <- function(index1, index2, metrics, 
                                      metric.correlations, verbose = F, count) {
    metric1 <- metrics[index1]
    metric2 <- metrics[index2]
    if (verbose) 
      cat(paste0("Step ", count, " - {", metric1, ", ", 
                 metric2, "} > "))
    metric1.correlation <- mean(metric.correlations[metric1, 
                                                    !metrics %in% c(metric1, metric2)])
    metric2.correlation <- mean(metric.correlations[metric2, 
                                                    !metrics %in% c(metric1, metric2)])
    if (metric1.correlation <= metric2.correlation) {
      return(index2)
    }
    else {
      return(index1)
    }
  }
  metric.correlations <- abs(rcorr(as.matrix(dataset[, metrics]), 
                                   type = "spearman")$r)
  above.threshold <- which((metric.correlations >= spearman.threshold), 
                           arr.ind = TRUE)
  row.names(above.threshold) <- NULL
  above.threshold <- as.data.frame(above.threshold, row.names = NULL)
  above.threshold <- above.threshold[above.threshold$row != 
                                       above.threshold$col, ]
  above.threshold$correlation <- 100
  for (i in 1:nrow(above.threshold)) {
    above.threshold$correlation[i] <- metric.correlations[above.threshold$row[i], 
                                                          above.threshold$col[i]]
  }
  above.threshold <- above.threshold[order(-above.threshold$correlation), 
                                     ]
  exclude.metrics <- {
  }
  count <- 1
  repeat {
    if (nrow(above.threshold) == 0) 
      break
    tmp <- above.threshold[1, ]
    exclude.index <- .get.higher.correlation(tmp$row, tmp$col, 
                                             metrics, metric.correlations, verbose, count)
    exclude.metrics <- c(exclude.metrics, metrics[exclude.index])
    
    # rename metric, correlation table, and df
    rc_index <- c(tmp$row, tmp$col)
    to_replace_index <- rc_index[rc_index != exclude.index]
    to_replace_metric <- metrics[rc_index[rc_index != exclude.index]]
    
    combined_metric <- paste0(metrics[rc_index], collapse = '__')
    metrics[to_replace_index] <- combined_metric
    row.names(metric.correlations)[row.names(metric.correlations) == to_replace_metric] <- combined_metric
    colnames(metric.correlations)[colnames(metric.correlations) == to_replace_metric] <- combined_metric
    names(dataset)[names(dataset) == to_replace_metric] <- combined_metric
    
    if (verbose) {
      cat(paste0(metrics[exclude.index], "\n"))
      count <- count + 1
    }
    above.threshold <- above.threshold[-which((above.threshold$row == 
                                                 exclude.index) | (above.threshold$col == exclude.index)), 
                                       ]
  }
  selected.metrics <- metrics[!metrics %in% exclude.metrics]
  return(list(dataset = dataset[, selected.metrics],
              selected.metrics = selected.metrics))
}

software-analytics/Rnalytica documentation built on Aug. 16, 2020, 9:38 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

software-analytics/Rnalytica
An R package of JIRA defect datasets and tool suites for explainable software analytics

R/get.automated.spearman.R
In software-analytics/Rnalytica: An R package of JIRA defect datasets and tool suites for explainable software analytics

Defines functions get.automated.spearman

Documented in get.automated.spearman

R Package Documentation

Browse R Packages

We want your feedback!

software-analytics/Rnalytica An R package of JIRA defect datasets and tool suites for explainable software analytics

R/get.automated.spearman.R In software-analytics/Rnalytica: An R package of JIRA defect datasets and tool suites for explainable software analytics

Defines functions get.automated.spearman

Documented in get.automated.spearman

R Package Documentation

Browse R Packages

We want your feedback!

software-analytics/Rnalytica
An R package of JIRA defect datasets and tool suites for explainable software analytics

R/get.automated.spearman.R
In software-analytics/Rnalytica: An R package of JIRA defect datasets and tool suites for explainable software analytics