# noisysensors.R
# Copyright (C) 2022 Geert van Boxtel, <G.J.M.vanBoxtel@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Version history:
# 20220204 GvB Initial setup v0.3-2
# 20220315 GvB changed round() to mround(), bugfix in
# randsample(), catch errors in RANSAC,
# bugfix cor_dev, corMed, rcorr, ransac_frac
# 20220329 GvB bugfix return variables if ransac not performed
#---------------------------------------------------------------------------------------------------------------------
#' noisysensors
#'
#' Detect noisy or outlier sensors
#'
#' The algorithm uses four primary measures: extreme amplitudes (deviation
#' criterion), unusual high frequency noise (noisiness criterion), lack of
#' correlation with any other sensor (correlation criterion), and lack of
#' predictability by other sensors (predictability criterion), in this order.
#' Several of the criteria use a robust z score, replacing the mean by the
#' median and the standard deviation by the robust standard deviation (0.7413
#' times the interquartile range). The algorithm also detects sensors that
#' contain any NAs that have significant periods with constant values or very
#' small values.
#'
#' The **deviation criterion** calculates the robust z-score of the
#' robust standard deviation for each sensor. Sensors designated as bad have
#' a robust z-score greater than \code{devthr} (default 5).
#'
#' The **noisiness criterion** of signal quality uses a robust estimate of
#' the ratio of the power of the high frequency components to the power in the
#' low frequency components. A 50 Hz low pass FIR filter is used to separate the
#' low and high frequency components. Noisiness is defined as the ratio of the
#' median absolute deviation of the high frequency component over the low
#' frequency component for each sensor. A z-score relative to all of the sensors
#' is computed and sensors with a z-score greater than a threshold
#' (\code{HFnoisethr}; default: 5) are marked as bad.
#'
#' The **correlation criterion** is based on the observation that the low
#' frequency portion of EEG signals is somewhat correlated (but not too
#' correlated) among channels. Using signals low- pass filtered at 50 Hz, the
#' algorithm calculates the correlation of each sensor with the other channels
#' in small, non-overlapping time windows (\code{corsecs} parameter; 1 s by
#' default). The maximum absolute correlation is calculated as the 98th
#' percentile of the absolute values of the correlations with the other sensors
#' in each window. If this maximum correlation is less than a threshold
#' (\code{corthr}; 0.4 by default) for a certain percentage of the windows
#' (\code{timethr}; 1% by default), then the sensor is classified as bad.
#'
#' The **predictability criterion** also relies on correlations of the low
#' frequency portion of the signals. The RANSAC (random sample consensus) method
#' of Fischler and Bolles (1981) is used to select a random subset of (so far)
#' good sensors to predict the behavior of each sensor in small non-overlapping
#' time windows (\code{rwin}, 5 seconds by default). A random subset of
#' predictor sensors is chosen for each sensor (\code{sfrac}; 25% by default).
#' The RANSAC algorithm uses a method of spherical splines for estimating scalp
#' potentials. Sensors which have a correlation less than a threshold
#' (\code{ransacthr}; 0.75 by default) with their RANSAC-predicted time courses
#' on more than a certain fraction (\code{ubtime}; 0.4 by default) of the
#' windows are flagged as bad.
#'
#' @param x input data, specified as a numeric matrix or vector. In case
#' of a vector it represents a single signal; in case of a matrix each column
#' is a signal. Alternatively, an object of class \code{\link{ctd}}. The data
#' is assumed to be high-pass filtered.
#' @param fs sampling frequency of \code{x} in Hz. Default: 1. Overruled if
#' \code{x} is a \code{ctd} object, in which case the sampling frequency is
#' \code{fs(x)}. .
#' @param sensors sensors to detect, specified as positive integers indicating
#' sensors numbers, or as sensor names (colnames of \code{x}). Default: all
#' sensors names in \code{x} excluding a variable \code{t} (if present).
#' @param devthr robust deviation threshold, specified as a numeric value.
#' Sensors with robust z-score greater than this value are marked as bad.
#' Default: 5
#' @param HFnoisethr high-frequency noise threshold, specified as a numeric
#' value. Sensors with a robust z-score calculated from the ratio between
#' high- and low-frequency activity are marked as bad. Default: 5
#' @param corsecs segment length in seconds for computing correlations between
#' sensors. The data is segmented in non-overlapping segments of this duration
#' before calculating correlations. Default: 1 second
#' @param corthr correlation threshold, specified as a numeric value. Sensors
#' which have a correlation with other channels lower than this value, for a
#' proportion of the data specified by \code{timethr}, are flagged as
#' bad. Default: 0.4
#' @param timethr time threshold, specified as a numeric value. Proportion of
#' segments during which the correlation is allowed to be lower than
#' \code{corthr} before the sensor is marked as bad. Default: 0.01
#' @param ransac logical indicating whether to perform RANSAC (random sample
#' consensus). Default: TRUE
#' @param slocs sensor locations, needed for RANSAC, specified as a data frame
#' according to \code{\link{sensorlocs}} format. Default: obtained from the
#' sensor names given by the column names of \code{x}
#' @param sfrac fraction of sensors for robust reconstruction. Default: 0.25
#' @param ubtime RANSAC unbroken time - cutoff fraction of time a sensor can
#' have poor RANSAC predictability. Default: 0.4
#' @param ransacthr RANSAC threshold. Sensors which with a correlation less
#' this value with their RANSAC-predicted time courses on more than
#' \code{ubtime} proportion of the windows are flagged as bad. Default: 0.75
#' @param rwin RANSAC window in seconds. Default: 5
#' @param rss RANSAC sample size. Default: 50
#'
#' @return A list with the following fields:
#' \describe{
#' \item{BadFromNA}{**Unusable**: sensors containing NA values}
#' \item{BadFromConstant}{**Unusable**: sensors containing constant values
#' (usually 0)}
#' \item{devthr}{**Deviation criterion**: sensors having a robust z-score of
#' the robust standard deviation greater than this value are classified as
#' bad}
#' \item{dev}{**Deviation criterion**: robust standard deviation for each
#' sensor}
#' \item{devSD}{**Deviation criterion**: robust SD of deviation scores}
#' \item{devMed}{**Deviation criterion**: median of deviation scores}
#' \item{Deviation}{**Deviation criterion**: robust z-scores of deviation
#' scores}
#' \item{BadFromDeviation}{**Deviation criterion**: sensors marked as bad by
#' the deviation criterion}
#' \item{HFnoisethr}{**Noisiness criterion**: sensors having a robust
#' noisiness z-score greater than this value are classified as bad}
#' \item{HFnoise}{**Noisiness criterion**: ratio of high over low frequency
#' noise (per median absolute deviation) for each sensor}
#' \item{medHFnoise}{**Noisiness criterion**: median of HFnoise values}
#' \item{sdHFnoise}{**Noisiness criterion**: robust SD of HFnoise values}
#' \item{zHFnoise}{**Noisiness criterion**: robust z-scores of HFnoise values}
#' \item{BadFromHFnoise}{**Noisiness criterion**: sensors marked as bad by the
#' noisiness criterion}
#' \item{corsecs}{**Correlation criterion**: for the correlation criterion,
#' each sensor is segmented into non-overlapping periods of this length
#' (seconds)}
#' \item{corthr}{**Correlation criterion**: sensors with a correlation less
#' than this value are marked as bad}
#' \item{timethr}{**Correlation criterion**: percentage of segments of a
#' sensor for which the correlation is allowed to be less than the threshold}
#' \item{cors}{**Correlation criterion**: matrix of size [number of segments,
#' number of sensors] containing, for each segment, the maximum absolute
#' correlation (98% percentile of the absolute value of a sensor's correlation
#' with the other sensors)}
#' \item{cor_noise}{**Correlation criterion**: matrix of size [number of
#' segments, number of sensors] containing, for each segment and sensor, a
#' value of the noise level, defined as the ratio of the difference between
#' the original and the 50 Hz low-pass filtered data}
#' \item{cor_dev}{**Correlation criterion**: robust deviation scores for
#' cor_noise}
#' \item{corMed}{**Correlation criterion**: median of correlations per sensor}
#' \item{BadFromCorrelation}{**Correlation criterion**: sensors marked as bad
#' by the correlation criterion}
#' \item{BadFromDropout}{**Correlation criterion**: sensors marked as bad by
#' the cor_noise noisiness criterion (dropouts)}
#' \item{ransacPerformed}{**Predictability criterion**: logical indicating
#' whether RANSAC was performed}
#' \item{ransacSensors}{**Predictability criterion**: sensors on which RANSAC
#' was performed}
#' \item{sfrac}{**Predictability criterion**: fraction of sensors on which
#' RANSAC prediction was based}
#' \item{ubtime}{**Predictability criterion**: RANSAC unbroken time - cutoff
#' fraction of time a sensor can have poor RANSAC predictability}
#' \item{ransacthr}{**Predictability criterion**: RANSAC threshold. Sensors
#' which with a correlation less this value with their RANSAC-predicted time
#' courses on more than ubtime proportion of the windows are flagged as bad}
#' \item{rwin}{**Predictability criterion**: RANSAC window length in seconds}
#' \item{rss}{**Predictability criterion**: RANSAC sample size}
#' \item{rcorr}{**Predictability criterion**: matrix of size [number of
#' sensors, number of windows] containing, for each segment (window) and
#' sensor, the RANSAC correlations}
#' \item{ransac_frac}{**Predictability criterion**: fraction of bad RANSAC
#' windows}
#' \item{BadFromRansac}{**Predictability criterion**: sensors marked as bad
#' from the predictability criterion}
#' \item{TotalBad}{**Summary**: indices of all bad sensors}
#' \item{BadNames}{**Summary**: names of bad sensors}
#' }
#'
#' @examples
#' \dontrun{
#' noisy <- noisysensors(EEGdata)
#' }
#'
#' @author Methods 1 and 4 are adapted from code by Christian Kothe and Methods
#' 2 and 3 are adapted from code by Nima Bigdely-Shamlo; ported to R and
#' adapted by Geert van Boxtel, \email{G.J.M.vanBoxtel@@gmail.com},
#'
#' @references Bigdely-Shamlo, N., Mullen, T.,, Kothe, C.,, Su, K.-M., and
#' Robbins, K. A. (2015). The PREP pipeline: standardized preprocessing for
#' large-scale EEG analysis. Frontiers in Neuroinformatics, 9, Article 16,
#' \url{https://www.frontiersin.org/article/10.3389/fninf.2015.00016}.
#' @references Fischler, M.A., and Bolles, R.C. (1981). Random sample consensus:
#' A paradigm for model fitting with apphcatlons to image analysis and
#' automated cartography. Communications of the ACM, 24(6)), 381-395.
#'
#' @export
noisysensors <- function(x, fs = 1, sensors = setdiff(colnames(x), "t"),
devthr = 5,
HFnoisethr = 5,
corsecs = 1, corthr = 0.4, timethr = 0.01,
ransac = TRUE, slocs = eegr::getLocationsfromLabels(colnames(x)[sensors]),
sfrac = 0.25, ubtime = 0.4, ransacthr = 0.75, rwin = 5, rss = 50) {
if ("ctd" %in% class(x)) {
fs <- fs(x)
} else {
if (!(is.numeric(fs) && length(fs) == 1 && fs > 0)) {
stop("Sampling frequency (fs) must be a positive numeric value")
}
}
if (is.vector(x)) {
cn <- names(x)
x <- matrix(x, ncol = 1)
vec <- TRUE
} else {
cn <- colnames(x)
vec <- FALSE
}
npts <- nrow(x)
ns <- ncol(x)
# work with sensor numbers rather than names
if (is.character(sensors)) {
sensors <- sort(which(sensors %in% colnames(x)))
}
if (!is.numeric(sensors)) {
stop("sensors must be specified as names or numbers")
}
if (!is.numeric(devthr) || length(devthr) != 1L) {
stop('deviation threshold must be a numeric scalar value')
} else {
devthr <- abs(devthr)
}
if (!is.numeric(HFnoisethr) || length(HFnoisethr) != 1L) {
stop('high-frequency noise threshold must be a numeric scalar value')
}
if (!is.numeric(corsecs) || length(corsecs) != 1L || corsecs <= 0) {
stop('high-frequency noise threshold must be a positive numeric scalar value')
}
if (!is.numeric(corthr) || length(corthr) != 1L) {
stop('high-frequency noise threshold must be a numeric scalar value')
}
if (!is.numeric(timethr) || length(timethr) != 1L || timethr <= 0) {
stop('time threshold must be a positive numeric scalar value')
}
if (!is.logical(ransac) || length(ransac) != 1L) {
stop('ransac must be a logical scalar value')
}
if (ransac) {
if (!is.numeric(sfrac) || length(sfrac) != 1L || sfrac <= 0) {
stop('time threshold must be a positive numeric scalar value')
}
if (!is.numeric(ubtime) || length(ubtime) != 1L || ubtime <= 0) {
stop('time threshold must be a positive numeric scalar value')
}
if (!is.numeric(ransacthr) || length(ransacthr) != 1L) {
stop('ransac threshold must be a numeric scalar value')
}
if (!is.numeric(rwin) || length(rwin) != 1L || rwin <= 0) {
stop('ransac window must be a positive numeric scalar value')
}
}
# Detect unusable sensors
orig_sensors <- sensors
bad <- unusablesensors(x[, sensors])
if (bad$n > 0) {
sensors[which(sensors %in% bad$unusable)] <- NA
}
good <- which(!is.na(sensors))
#############################################################################
# Method 1: Unusually high or low amplitude (using robust std)
dev <- apply(x[, good], 2, function(x) 0.7413 * stats::IQR(x)) # Robust estimate of SD
devSD <- 0.7413 * stats::IQR(dev)
devMed <- stats::median(dev, na.rm = TRUE)
Deviation <- rep(NA, ns)
names(Deviation) <- cn
Deviation[good] <- (dev - devMed) / devSD
noisyDev <- which(!is.na(Deviation) & abs(Deviation) > devthr)
#############################################################################
# Method 2: Compute the SNR (based on Christian Kothe's clean_channels)
# Note: RANSAC uses the filtered values X of the data
HFnoise <- zHFnoise <- rep(NA, ns)
names(zHFnoise) <- cn
if (fs > 100) {
# Remove signal content above 50Hz (assume HP filtering already done)
nyquist <- fs / 2
h <- gsignal::fir2(100, c(0, 45, 50, nyquist) / nyquist, c(1, 1, 0, 0))
X <- apply(x, 2, function(x, h) gsignal::filtfilt(h, x), h)
HFnoise <- apply(x[, good], 2, function(x, X, good)
stats::mad(x - X[, good]) / stats::mad(X[, good]), X, good)
medHFnoise <- stats::median(HFnoise, na.rm = TRUE)
sdHFnoise <- stats::mad(HFnoise) * 1.4826
zHFnoise[good] <- (HFnoise - medHFnoise) / sdHFnoise
noisyHF <- which(!is.na(zHFnoise) & zHFnoise > HFnoisethr)
} else {
X = x
medHFnoise <- 0
sdHFnoise <- 1
zHFnoise[sensors] <- rep(0, length(sensors))
names(zHFnoise) <- cn
noisyHF <- integer(0)
}
#############################################################################
# Method 3: Global correlation criteria (from Nima Bigdely-Shamlo)
cor_seglen <- corsecs * fs
cor_offsets <- seq(1, (npts - cor_seglen), cor_seglen)
nwin <- length(cor_offsets)
cors <- cor_noise <- cor_dev <- matrix(NA, nwin, ns)
colnames(cors) <- colnames(cor_noise) <- colnames(cor_dev) <- cn
corMed <- frac_cor <- frac_do <- rep(NA, ns)
names(corMed) <- names(frac_cor) <- names(frac_do) <- cn
for (w in seq_len(nwin)) {
o <- cor_offsets[w]
X_seg <- X[o:(o + cor_seglen - 1), good] #low-pass filtered data
x_seg <- x[o:(o + cor_seglen - 1), good] #original data
if (all(X_seg == x_seg)) {
warning("HF and LF data are identical")
} else {
wincor <- suppressWarnings(stats::cor(X_seg)) #correlation matrix
abscor <- abs(wincor - diag(diag(wincor))) #diagonal always contains 1s
cors[w, good] <- apply(abscor, 2, stats::quantile, 0.98, na.rm = TRUE)
cor_noise[w, good] <- apply(x_seg, 2, function(x, X)
stats::mad(x - X) / stats::mad(X), X_seg)
cor_dev[w, good] <- apply(x_seg, 2, function(x) 0.7413 * stats::IQR(x))
}
}
dropouts <- is.na(cors[, good]) | is.na(cor_noise[, good])
thr_cors <- cors[, good] < corthr
frac_cor[good] <- apply(thr_cors, 2, mean)
frac_do[good] <- apply(dropouts, 2, mean)
noisyCor <- which(frac_cor > timethr)
noisyDO <- which(frac_do > timethr)
med <- apply(cors[, good], 2, stats::median)
corMed[good] <- med
#############################################################################
# Bad so far by amplitude and correlation (take these out before doing ransac)
noisy = union(noisyDev, union(noisyHF, union(noisyCor, noisyDO)))
#############################################################################
# Method 4: Ransac corelation (may not be performed)
# Setup for ransac (if a 2-stage algorithm, remove other bad channels first)
ransac_frac <- 0
ransacPerformed <- TRUE
if (!ransac) {
ransacPerformed <- FALSE
} else if (is.null(slocs) || nrow(slocs) <= 0) {
warning("ransac could not be computed because there were no sensor locations")
ransacPerformed <- FALSE
} else if (!any(c("x", "y", "z") %in% colnames(slocs))) {
warning("slocs must contain valid x, y and z variables")
ransacPerformed <- FALSE
} else {
rsens <- setdiff(good, noisy)
X <- X[, rsens]
ransacSS <- mround(sfrac * ns) #rounding is different in Matlab and R
if (ubtime < 1) {
ubframes <- npts * ubtime
} else {
ubframes = fs * ubtime
}
rFrames <- rwin * fs
rWindow <- seq(0, rFrames - 1)
rOffsets <- seq(1, max(1, (npts - rFrames)), rFrames)
WRansac <- length(rOffsets)
nlocs <- nrow(slocs)
if (nlocs < 3 || length(rsens) < 2) {
warning("too many channels have failed quality tests to perform ransac")
ransacPerformed <- FALSE
}
}
noisyRansac <- integer(0)
if (ransacPerformed) {
err <- try({
locs <- slocs[rsens, c("x", "y", "z")]
P <- calc_projector(locs, rss, ransacSS)
corrT <- matrix(0, nrow(locs), WRansac)
# Calculate each channel's correlation to its RANSAC reconstruction for each window
n <- length(rWindow)
m <- length(rsens)
p <- rss
Xwin <- X[1:(n*WRansac), ]
dim(Xwin) <- c(m, n, WRansac)
for (k in seq_len(WRansac)) {
corrT[, k] <- calculateRansacWindow(t(Xwin[, , k]), P, n, m, p)
}
flagged <- corrT < ransacthr
if (length(flagged) > 0) {
noisyRansac = which(sum(flagged) * rFrames > ubframes)
ransac_frac = apply(flagged, 1, sum) / length(flagged)
names(ransac_frac) <- colnames(x)[rsens]
} else {
noisyRansac <- integer(0)
ransac_frac <- NA
}
}, silent = TRUE)
if ("try-error" %in% class(err)) {
ransacPerformed <- FALSE
}
}
# either not performed or exited with error
if (!ransacPerformed) {
rsens <- sfrac <- ubtime <- ransacthr <- rwin <- rss <- corrT <- ransac_frac <- NA
noisyRansac <- integer(0)
}
# Combine bad channels detected from all methods
noisy = sort(union(bad$unusable, union(noisy, noisyRansac)))
# Set up return values
rv <- list(
# unusable
BadFromNA = bad$sensor_NA,
BadFromConstant = bad$sensor_constant,
# 1: deviation
devthr = devthr,
dev = dev,
devSD = devSD,
devMed = devMed,
Deviation = Deviation[sensors],
BadFromDeviation = noisyDev,
# 2: High Frequency Noise
HFnoisethr = HFnoisethr,
HFnoise = HFnoise,
medHFnoise = medHFnoise,
sdHFnoise = sdHFnoise,
zHFnoise = zHFnoise[sensors],
BadFromHFnoise = noisyHF,
# 3: correlation
corsecs = corsecs,
corthr = corthr,
timethr = timethr,
cors = cors[, sensors],
cor_noise = cor_noise[, sensors],
cor_dev = cor_dev[, sensors],
corMed = corMed[sensors],
BadFromCorrelation = noisyCor,
BadFromDropout = noisyDO,
# 4: RANSAC
ransacPerformed = ransacPerformed,
ransacSensors = rsens,
sfrac = sfrac,
ubtime = ubtime,
ransacthr = ransacthr,
rwin = rwin,
rss = rss,
rcorr = corrT,
ransac_frac = ransac_frac,
BadFromRansac = noisyRansac,
# summary
TotalBad = noisy,
BadNames = cn[noisy]
)
rv
}
# helper functions for RANSAC
calc_projector <- function(locs, numberSamples, subsetSize) {
# Calculate a bag of reconstruction matrices from random channel subsets
ps <- getRandomSubsets(locs, subsetSize, numberSamples)
permutedLocations <- ps$permutedLocations
subsets <- ps$subsets
randomSamples = vector("list", numberSamples)
for (k in seq_len(numberSamples)) {
tmp <- matrix(0, nrow(locs), nrow(locs))
slice <- subsets[k, ]
tmp[slice, ] <- Re(ssinterp(t(permutedLocations[, , k]), locs))
randomSamples[[k]] <- tmp
}
P = do.call(cbind, randomSamples)
P
}
getRandomSubsets <- function(locs, subsetSize, numberSamples) {
#stream = RandStream('mt19937ar', 'Seed', 435656);
numberChannels <- nrow(locs)
permutedLocations <- array(0, dim = c(3, subsetSize, numberSamples))
subsets = matrix(0, numberSamples, subsetSize)
for (k in seq_len(numberSamples)) {
subset <- randsample(seq_len(numberChannels), subsetSize)
subsets[k, ] <- subset
permutedLocations[1:3, 1:subsetSize, k] <- t(locs[subset, ])
}
list(permutedLocations = permutedLocations, subsets = subsets)
}
randsample <- function (X, num) {
Y <- rep(0, num)
for (k in seq_len(num)) {
# make sure pick > 0
while (TRUE) {
pick <- mround(1 + (length(X) - 1) * stats::runif(1))
if (pick > 0) break
}
Y[k] <- X[pick]
X <- X[-pick]
}
Y
}
calculateRansacWindow <- function(XX, P, n, m, p) {
YY <- XX %*% P
dim(YY) <- c(n, m, p)
YY = aperm(apply(YY, 1:2, sort), c(2,3,1))
YY = YY[, , mround(dim(YY)[3] / 2)]
rX = apply(XX * YY, 2, sum) /
(sqrt(apply(XX^2, 2, sum)) * sqrt(apply(YY^2, 2, sum)))
rX
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.