R/kolmogorov.R
In matrixTests: Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

Documented in col_kolmogorovsmirnov_twosample row_kolmogorovsmirnov_twosample

#' Kolmogorov-Smirnov test
#'
#' Performs a Kolmogorov-Smirnov test on each row/column of the input matrix.
#'
#' Function to perform two sample Kolmogorov-Smirnov test on rows/columns of
#' matrices. Main arguments and results were intentionally matched to the
#' \code{ks.test()} function from default stats package.
#'
#' Results should be the same as running \code{ks.test(x, y)} on every row (or
#' column) of \code{x} and \code{y}.
#'
#' By default if 'exact' argument is set to 'NA', exact p-values are computed
#' if the product of 'x' and 'y' sample sizes is less than 10000. Otherwise,
#' asymptotic distributions are used.
#'
#' Alternative hypothesis setting specifies null and alternative hypotheses.
#' The possible values of 'two sided', 'less', and 'greater'.
#' 'two sided' sets the null hypothesis for the distributions of 'x' being equal to the distribution 'y'.
#' 'less' sets the null hypothesis for the distribution of x not being less than the distribution of y.
#' 'greater' sets the null hypothesis for the distribution of x not being greater than the distribution of y.
#' See \code{help(ks.test)} for more details.
#'
#' @param x numeric matrix.
#' @param y numeric matrix for the second group of observations.
#' @param alternative alternative hypothesis to use for each row/column of x.
#' A single string or a vector with values for each observation.
#' Values must be one of "two.sided" (default), "greater" or "less".
#' @param exact logical or NA (default) indicator whether an exact p-value should be computed (see Details).
#' A single value or a logical vector with values for each observation.
#'
#' @return a data.frame where each row contains the results of a Kolmogorov-Smirnov test
#' performed on the corresponding row/column of x and y.
#' Each row contains the following information (in order):\cr
#' 1. obs.x - number of x observations\cr
#' 2. obs.y - number of y observations\cr
#' 3. obs.tot - total number of observations\cr
#' 5. statistic - Wilcoxon test statistic\cr
#' 6. pvalue - p-value\cr
#' 8. alternative - chosen alternative hypothesis\cr
#' 9. exact - indicates if exact p-value was computed\cr
#'
#' @seealso \code{ks.test()}
#'
#' @examples
#' X <- iris[iris$Species=="setosa", 1:4]
#' Y <- iris[iris$Species=="virginica", 1:4]
#' col_kolmogorovsmirnov_twosample(X, Y)
#'
#' # same column using different alternative hypotheses
#' col_kolmogorovsmirnov_twosample(X[,c(1,1,1)], Y[,c(1,1,1)], alternative=c("t", "g", "l"))
#'
#' @author Karolis Koncevičius
#' @name kolmogorov
#' @export
row_kolmogorovsmirnov_twosample <- function(x, y, alternative="two.sided", exact=NA) {
  is.null(x)
  is.null(y)

  if (is.vector(x))
    x <- matrix(x, nrow=1)
  if(is.vector(y))
    y <- matrix(y, nrow=1)

  if (is.data.frame(x) && all(sapply(x, is.numeric)))
    x <- data.matrix(x)
  if(is.data.frame(y) && all(sapply(y, is.numeric)))
    y <- data.matrix(y)

  assert_numeric_mat_or_vec(x)
  assert_numeric_mat_or_vec(y)

  if(nrow(y)==1 & nrow(x)>1) {
    y <- matrix(y, nrow=nrow(x), ncol=ncol(y), byrow=TRUE)
  }

  assert_equal_nrow(x, y)

  if(length(alternative)==1)
    alternative <- rep.int(alternative, nrow(x))
  assert_character_vec_length(alternative, 1, nrow(x))

  choices <- c("two.sided", "less", "greater")
  alternative <- choices[pmatch(alternative, choices, duplicates.ok=TRUE)]
  assert_all_in_set(alternative, choices)

  if(length(exact)==1)
    exact <- rep.int(exact, nrow(x))
  assert_logical_vec_length(exact, 1, nrow(x))
  assert_all_in_set(exact, c(TRUE, FALSE, NA))


  nxs <- as.numeric(ncol(x) - matrixStats::rowCounts(x, value=NA))
  nys <- as.numeric(ncol(y) - matrixStats::rowCounts(y, value=NA))

  naexact <- is.na(exact)
  exact[naexact] <- (nxs[naexact] * nys[naexact]) < 10000

  w <- cbind(x, y)
  z <- col(w)
  z[is.na(w)] <- NA

  # order w and z by row
  ord <- order(row(w), w)
  w   <- matrix(w[ord], nrow=nrow(w), byrow=TRUE)
  z   <- matrix(z[ord], nrow=nrow(w), byrow=TRUE)

  inds <- z <= ncol(x)
  z[which(inds)]  <- rep(1/nxs, ncol(inds))[which(inds)]    # replace values by row
  z[which(!inds)] <- rep(-1/nys, ncol(inds))[which(!inds)]  # replace values by row
  # z <- matrixStats::rowCumsums(z)  # NOTE: precision problems which cause bugs
  # TODO: improve this
  z <- t(apply(z, 1, cumsum))

  w[is.nan(w)] <- NA  # so that NaNs will not show ties error
  diffs   <- matrixStats::rowDiffs(w)
  ties    <- cbind(diffs == 0 | is.nan(diffs), rep(FALSE, nrow(diffs)))  # NOTE: is.nan needed to show ties with infinities
  hasties <- matrixStats::rowAnys(ties)
  z[which(ties)] <- NA

  statistic <- rep.int(NA_real_, nrow(x))
  p         <- rep.int(NA_real_, nrow(x))

  inds <- alternative == "two.sided" & nxs > 0 & nys > 0  # NOTE: because psmirnove gives an error otherwise
  if(any(inds)) {
    statistic[inds] <- matrixStats::rowMaxs(abs(z[inds,,drop=FALSE]), na.rm=TRUE)
    # NOTE: this is a fix for a change of argument names in psmirnov() @ R 4.4.0
    #       it is repeated across all 3 cases below
    #       in the future when I have more time for this - good idea to change
    #       one possibility is to create a wrapper for stats::psmirnov that does this automatically
    if(!"two.sided" %in% names(formals(stats::psmirnov))) {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], alternative = "two.sided", lower.tail = FALSE)
    } else {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], two.sided = TRUE, lower.tail = FALSE)
    }
  }

  inds <- alternative == "greater" & nxs > 0 & nys > 0
  if(any(inds)) {
    statistic[inds] <- matrixStats::rowMaxs(z[inds,,drop=FALSE], na.rm=TRUE)
    if(!"two.sided" %in% names(formals(stats::psmirnov))) {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], alternative = "greater", lower.tail = FALSE)
    } else {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], two.sided = FALSE, lower.tail = FALSE)
    }
  }

  inds <- alternative == "less" & nxs > 0 & nys > 0
  if(any(inds)) {
    statistic[inds] <- -matrixStats::rowMins(z[inds,,drop=FALSE], na.rm=TRUE)
    if(!"two.sided" %in% names(formals(stats::psmirnov))) {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], alternative = "less", lower.tail = FALSE)
    } else {
      p[inds] <- mapply(stats::psmirnov, statistic[inds], asplit(cbind(nxs[inds], nys[inds]), 1),
                        asplit(w[inds,,drop=FALSE], 1), exact[inds], two.sided = FALSE, lower.tail = FALSE)
    }
  }

  # NOTE: not sure if necessary
  # p <- pmin(1, pmax(0, p))


  w1 <- nxs < 1
  showWarning(w1, 'kolmogorovsmirnov_twosample', 'had less than 1 "x" observation')

  w2 <- nys < 1
  showWarning(w2, 'kolmogorovsmirnov_twosample', 'had less than 1 "y" observation')

  w3 <- !exact & hasties
  showWarning(w3, 'kolmogorovsmirnov_twosample', 'had ties: asymptotic p-values will be approximate')

  statistic[w1 | w2] <- NA
  p[w1 | w2]         <- NA


  rnames <- rownames(x)
  if(!is.null(rnames)) rnames <- make.unique(rnames)
  data.frame(obs.x=nxs, obs.y=nys, obs.tot=nxs+nys, statistic=statistic,
             pvalue=p, alternative=alternative, exact=exact,
             stringsAsFactors=FALSE, row.names=rnames
             )
}

#' @rdname kolmogorov
#' @export
col_kolmogorovsmirnov_twosample <- function(x, y, alternative="two.sided", exact=NA) {
  row_kolmogorovsmirnov_twosample(t(x), t(y), alternative=alternative, exact=exact)
}

Any scripts or data that you put into this service are public.

matrixTests documentation built on Oct. 6, 2023, 1:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

matrixTests
Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

R/kolmogorov.R
In matrixTests: Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

Defines functions col_kolmogorovsmirnov_twosample row_kolmogorovsmirnov_twosample

Documented in col_kolmogorovsmirnov_twosample row_kolmogorovsmirnov_twosample

Try the matrixTests package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

matrixTests Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

R/kolmogorov.R In matrixTests: Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

Defines functions col_kolmogorovsmirnov_twosample row_kolmogorovsmirnov_twosample

Documented in col_kolmogorovsmirnov_twosample row_kolmogorovsmirnov_twosample

Try the matrixTests package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

matrixTests
Fast Statistical Hypothesis Tests on Rows and Columns of Matrices

R/kolmogorov.R
In matrixTests: Fast Statistical Hypothesis Tests on Rows and Columns of Matrices