# R/calculateStats.R In countsimQC: Compare Characteristic Features of Count Data Sets

#### Documented in calculateStatsdefaultStats

```#' Calculate statistics for pairwise comparison of data sets
#'
#' Calculate a range of statistics and p-values for comparison of two data sets.
#'
#' @param df The input data frame. Must contain at least a column named
#'   'dataset' and an additional column with values to use as the basis for the
#'   comparison.
#' @param ds1,ds2 The names of the two data sets to be compared.
#' @param column The name of the column(s) of \code{df} to be used as the basis
#'   for the comparison.
#' @param subsampleSize The number of observations for which certain
#'   time-consuming statistics will be calculated. The observations will be
#'   selected randomly among the rows of \code{df}.
#' @param permute Whether to permute the dataset column of \code{df} before
#'   calculating the statistics.
#' @param kmin,kfrac For statistics that require the extraction of k nearest
#'   neighbors of a given point, the number of neighbors will be max(kmin, kfrac
#'   * nrow(df)).
#' @param xmin,xmax Smallest and largest value of \code{column}, used to
#'   normalize the x-axis when calculating the area between the eCDFs.
#'
#' @return A vector with statistics and p-values
#'
#' @keywords internal
#' @author Charlotte Soneson
#'
#' @importFrom stats ks.test ecdf chisq.test
#'
calculateStats <- function(df, ds1, ds2, column, subsampleSize,
permute = FALSE, kmin, kfrac, xmin, xmax) {
## Remove rows with NA values in column(s) of interest
df <- df[rowSums(is.na(df[, column, drop = FALSE])) == 0, ]

## Permute dataset column if requested
if (permute) {
df\$dataset <- sample(df\$dataset, nrow(df))
}

if (length(column) == 1) {
## Kolmogorov-Smirnov test
kst <- stats::ks.test(df[, column][df\$dataset == ds1],
df[, column][df\$dataset == ds2])

## Area between ecdfs
e1 <- stats::ecdf(df[, column][df\$dataset == ds1])
e2 <- stats::ecdf(df[, column][df\$dataset == ds2])
xv <- sort(unique(df[, column]))
ediff <- abs(e1(xv) - e2(xv))
earea <- caTools::trapz((xv - xmin)/(xmax - xmin), ediff)
}

## Silhouette width and nearest neighbor distribution for subsample of
## observations
## Calculate overall proportion of values from the two data sets
overallprop <- table(factor(df\$dataset, levels = c(ds1, ds2)))
overallprop <- overallprop/sum(overallprop)
nrep <- 1
chisilh <- t(vapply(seq_len(nrep), function(m) {
## Select subset of observations for calculation of silhouette widths and
## nearest neighbor distributions
idx <- sample(seq_len(nrow(df)), min(nrow(df), subsampleSize))

## For each selected observation, calculate statistics
tmp <- t(vapply(idx, function(j) {

## Distances to all observations
dists <- sqrt(rowSums((sweep(df[, column, drop = FALSE],
2, unlist(df[j, column,
drop = FALSE]), "-")) ^ 2))
dists_this <- dists[df\$dataset == df\$dataset[j]]
dists_other <- dists[df\$dataset != df\$dataset[j]]

## Silhouette width
silh <- (mean(dists_other, na.rm = TRUE) -
mean(dists_this, na.rm = TRUE)) /
max(mean(dists_other, na.rm = TRUE), mean(dists_this, na.rm = TRUE))

## Local silhouette width
dists_this_local <-
sort(dists_this)[seq_len(max(kmin, kfrac * nrow(df)) *
overallprop[df\$dataset[j]])]
dists_other_local <-
sort(dists_other)[seq_len(max(kmin, kfrac * nrow(df)) *
(1 - overallprop[df\$dataset[j]]))]
silh_local <-
(mean(dists_other_local, na.rm = TRUE) -
mean(dists_this_local, na.rm = TRUE)) /
max(mean(dists_other_local, na.rm = TRUE),
mean(dists_this_local, na.rm = TRUE))
if (all(c(dists_this_local, dists_other_local) == 0)) {
silh_local <- 0
}

## Chi-square test comparing distribution of data set labels among
## nearest neighbors to the overall distribution
x <- df\$dataset[order(dists)][seq_len(max(kmin, kfrac * nrow(df)))]
chisqp <- stats::chisq.test(x = table(factor(x, levels = c(ds1, ds2))),
p = overallprop)\$p.value
## Return values
c(silh = silh, chisqp = chisqp, silh_local = silh_local)
}, c(silh = NA_real_, chisqp = NA_real_, silh_local = NA_real_)))
c(silh = mean(tmp[, "silh"], na.rm = TRUE),
silhlocal = mean(tmp[, "silh_local"], na.rm = TRUE),
chisqp = mean(tmp[, "chisqp"] <= 0.05, na.rm = TRUE))
}, c(silh = NA_real_, silhlocal = NA_real_, chisqp = NA_real_)))

## Runs test
if (length(column) == 1) {
df <- df %>% arrange_(column)
runs_res <- randtests::runs.test(as.numeric(df\$dataset == ds1),
threshold = 0.5,
alternative = "left.sided", plot = FALSE)
}

if (length(column) == 1)
c(ksstatistic = signif(kst\$statistic, 3),
kspvalue = signif(kst\$p.value, 3),
diffarea = signif(earea, 3),
runsstatistic = signif(runs_res\$statistic, 3),
runspvalue = signif(runs_res\$p.value, 3),
NNmismatch = signif(mean(chisilh[, "chisqp"]), 3),
avesilh = signif(mean(chisilh[, "silh"]), 3),
avesilhlocal = signif(mean(chisilh[, "silhlocal"]), 3))
else
c(NNmismatch = signif(mean(chisilh[, "chisqp"]), 3),
avesilh = signif(mean(chisilh[, "silh"]), 3),
avesilhlocal = signif(mean(chisilh[, "silhlocal"]), 3))
}

#' Return a vector of NA scores
#'
#' @param n Number of columns to use for the comparison
#' @param withP Whether or not to include p-value columns
#'
#' @return A vector with NA values for all applicable statistics
#'
#' @keywords internal
#' @author Charlotte Soneson
#'
defaultStats <- function(n, withP = FALSE) {
if (n == 1) {
if (withP) {
c(ksstatistic = NA_real_, kspvalue = NA_real_, diffarea = NA_real_,
runsstatistic = NA_real_, runspvalue = NA_real_, NNmismatch = NA_real_,
avesilh = NA_real_, avesilhlocal = NA_real_, NNmismatchP = NA_real_,
avesilhP = NA_real_, avesilhlocalP = NA_real_, diffareaP = NA_real_)
} else {
c(ksstatistic = NA_real_, kspvalue = NA_real_, diffarea = NA_real_,
runsstatistic = NA_real_, runspvalue = NA_real_, NNmismatch = NA_real_,
avesilh = NA_real_, avesilhlocal = NA_real_)
}
} else {
if (withP) {
c(NNmismatch = NA_real_, avesilh = NA_real_, avesilhlocal = NA_real_,
NNmismatchP = NA_real_, avesilhP = NA_real_, avesilhlocalP = NA_real_)
} else {
c(NNmismatch = NA_real_, avesilh = NA_real_, avesilhlocal = NA_real_)
}
}
}
```

## Try the countsimQC package in your browser

Any scripts or data that you put into this service are public.

countsimQC documentation built on Feb. 5, 2021, 2:02 a.m.