Nothing
data(sysdata, envir=environment())
horn.outliers = function (data)
{
# This function implements Horn's algorithm for outlier detection using
# Tukey's interquartile fences.
b = MASS::boxcox(lm(data ~ 1, y = TRUE), plotit=FALSE);
lambda = b$x[which.max(b$y)];
if (lambda == 0) {
transData = log(data);
}
else {
transData = ((data ^ lambda) - 1) / lambda;
}
descriptives = summary(transData);
Q1 = descriptives[[2]];
Q3 = descriptives[[5]];
IQR = Q3 - Q1;
out = transData[transData <= (Q1 - 1.5 * IQR) | transData >= (Q3 + 1.5 * IQR)];
sub = transData[transData > (Q1 - 1.5 * IQR) & transData < (Q3 + 1.5 * IQR)];
if (lambda == 0) {
return (list(outliers = (exp(1) ^ out), subset = (exp(1) ^ sub)));
}
else {
return (list(outliers = ((out*lambda) + 1)^(1/lambda), subset = ((sub*lambda) + 1)^(1/lambda)));
}
}
dixon.outliers = function (data)
{
# This outlier detection method implements Dixon and Dean's algorithm to find
# only a single outlier, if it exists.
d = sort(data);
dixResult = outliers::dixon.test(data);
pResult = dixResult[[3]];
result = strsplit(dixResult[[2]], " ");
if(pResult <= 0.05) {
out = result[[1]][3];
if(result[[1]][1] == "highest") {
sub = data[data < d[length(d)]];
}
else {
sub = data[data > d[1]];
}
}
else {
out = as.numeric(c());
sub = data;
}
return(list(outliers = out, subset = sub));
}
cook.outliers = function (data)
{
# This function identifies outliers based on their Cook Distance from a fitted
# regression. In this case, the regression is truly just the mean.
fit = lm(data ~ 1);
cooks_dist = cooks.distance(fit);
out = data[as.numeric(names(cooks_dist[cooks_dist > (4/length(cooks_dist)) |
cooks_dist > 1]))];
sub = data[as.numeric(names(cooks_dist[cooks_dist <= (4/length(cooks_dist)) &
cooks_dist <= 1]))];
return(list(outliers = out, subset = sub));
}
vanderLoo.outliers = function (data)
{
# This function identifies outliers using Mark van der Loo's Method I algorithm for
# outlier detection in the extremevalues package.
result = extremevalues::getOutliers(data, method = "I");
indices = c(result$iLeft, result$iRight);
out = data[indices];
sub = data[!data %in% out];
return(list(outliers = out, subset = sub));
}
robust = function (data, indices = c(1:length(data)), refConf = 0.95)
{
# This implements the robust algorithm as outlined in the CLSI document C28-A3c.
data = sort(data[indices]);
n = length(data);
median = summary(data)[[3]];
Tbi = median;
TbiNew = 10000;
c = 3.7;
MAD = summary(abs(data - median))[[3]];
MAD = MAD / 0.6745;
smallDiff = FALSE;
repeat {
ui = (data - Tbi) / (c * MAD);
ui[ui < -1] = 1;
ui[ui > 1] = 1;
wi = (1 - ui^2)^2;
TbiNew = (sum(data * wi) / sum(wi));
if(!is.finite(TbiNew) | (abs(TbiNew - Tbi)) < 0.000001){
break;
}
Tbi = TbiNew;
};
ui = NULL;
ui = (data - median) / (205.6 * MAD);
sbi205.6 = 205.6 * MAD * sqrt((n * sum(((1-ui[ui>-1 & ui<1]^2)^4)*ui[ui>-1 & ui<1]^2)) /
(sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2)) *
max(c(1, -1 + sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2))))));
ui = NULL;
ui = (data - median) / (3.7 * MAD);
sbi3.7 = 3.7 * MAD * sqrt((n * sum(((1-ui[ui>-1 & ui<1]^2)^4)*ui[ui>-1 & ui<1]^2)) /
(sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2)) *
max(c(1, -1 + sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2))))));
ui = NULL;
ui = (data - Tbi) / (3.7 * sbi3.7);
St3.7 = 3.7 * sbi3.7 * sqrt((sum(((1-ui[ui>-1 & ui<1]^2)^4)*ui[ui>-1 & ui<1]^2)) /
(sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2)) *
max(c(1, -1 + sum((1-ui[ui>-1 & ui<1]^2)*(1-5*ui[ui>-1 & ui<1]^2))))));
tStatistic = qt(1 - ((1 - refConf)/2), (n-1));
margin = tStatistic * sqrt(sbi205.6^2 + St3.7^2);
robustLower = Tbi - margin;
robustUpper = Tbi + margin;
RefInterval = c(robustLower, robustUpper);
return (RefInterval);
}
nonparRI = function(data, indices = 1:length(data), refConf = 0.95)
{
# Nonparametric calculation of reference interval, written to be a compatible
# boot statistic function.
d = data[indices];
results = c(quantile(d, (1 - refConf)/2, type = 6), quantile(d, 1-((1 - refConf)/2), type = 6));
return (results);
}
refLimit = function(data, out.method = "horn", out.rm = FALSE, RI = "p", CI = "p",
refConf = 0.95, limitConf = 0.90, bootStat = "basic"){
cl = class(data);
if(cl == "data.frame"){
frameLabels = colnames(data);
dname = deparse(substitute(data));
result = lapply(data, singleRefLimit, dname, out.method, out.rm, RI, CI, refConf, limitConf, bootStat);
for(i in 1:length(data)){
result[[i]]$dname = frameLabels[i];
}
class(result) = "interval";
}
else{
frameLabels = NULL;
dname = deparse(substitute(data));
result = singleRefLimit(data, dname, out.method, out.rm, RI, CI, refConf, limitConf, bootStat);
}
return(result);
}
singleRefLimit = function(data, dname = "default", out.method = "horn", out.rm = FALSE,
RI = "p", CI = "p", refConf = 0.95, limitConf = 0.90, bootStat = "basic")
{
# This function determines a reference interval from a vector of data samples.
# The default is a parametric calculation, but other options include a non-parametric
# calculation of reference interval with bootstrapped confidence intervals around the
# limits, and also the robust algorithm for calculating the reference interval with
# bootstrapped confidence intervals of the limits.
if(out.method == "dixon"){
output = dixon.outliers(data);
}
else if(out.method == "cook"){
output = cook.outliers(data);
}
else if(out.method == "vanderLoo"){
output = vanderLoo.outliers(data);
}
else{
output = horn.outliers(data);
}
if(out.rm == TRUE){
data = output$subset;
}
if(!bootStat %in% c("basic", "norm", "perc", "stud", "bca")) {
bootStat = "basic";
}
outliers = output$outliers;
n = length(data);
mean = mean(data, na.rm = TRUE);
sd = sd(data, na.rm = TRUE);
norm = NULL;
# Calculate a nonparametric reference interval.
if(RI == "n"){
methodRI = "Reference Interval calculated nonparametrically";
data = sort(data);
holder = nonparRI(data, indices = 1:length(data), refConf);
lowerRefLimit = holder[1];
upperRefLimit = holder[2];
if(CI == "p"){
CI = "n";
}
}
# Calculate a reference interval using the robust algorithm method.
if(RI == "r"){
methodRI = "Reference Interval calculated using Robust algorithm";
holder = robust(data, 1:length(data), refConf);
lowerRefLimit = holder[1];
upperRefLimit = holder[2];
CI = "boot";
}
# Calculate a reference interval parametrically, with parametric confidence interval
# around the limits.
if(RI == "p"){
# http://www.statsdirect.com/help/parametric_methods/reference_range.htm
# https://en.wikipedia.org/wiki/Reference_range#Confidence_interval_of_limit
methodRI = "Reference Interval calculated parametrically";
methodCI = "Confidence Intervals calculated parametrically";
refZ = qnorm(1 - ((1 - refConf) / 2));
limitZ = qnorm(1 - ((1 - limitConf) / 2));
lowerRefLimit = mean - refZ * sd;
upperRefLimit = mean + refZ * sd;
se = sqrt(((sd^2)/n) + (((refZ^2)*(sd^2))/(2*n)));
lowerRefLowLimit = lowerRefLimit - limitZ * se;
lowerRefUpperLimit = lowerRefLimit + limitZ * se;
upperRefLowLimit = upperRefLimit - limitZ * se;
upperRefUpperLimit = upperRefLimit + limitZ * se;
shap_normalcy = shapiro.test(data);
shap_output = paste(c("Shapiro-Wilk: W = ", format(shap_normalcy$statistic,
digits = 6), ", p-value = ", format(shap_normalcy$p.value,
digits = 6)), collapse = "");
ks_normalcy = suppressWarnings(ks.test(data, "pnorm", m = mean, sd = sd));
ks_output = paste(c("Kolmorgorov-Smirnov: D = ", format(ks_normalcy$statistic,
digits = 6), ", p-value = ", format(ks_normalcy$p.value,
digits = 6)), collapse = "");
if(shap_normalcy$p.value < 0.05 | ks_normalcy$p.value < 0.05){
norm = list(shap_output, ks_output);
}
else{
norm = list(shap_output, ks_output);
}
}
# Calculate confidence interval around limits nonparametrically.
if(CI == "n"){
if(n < 120){
cat("\nSample size too small for non-parametric confidence intervals,
bootstrapping instead\n");
CI = "boot";
}
else{
methodCI = "Confidence Intervals calculated nonparametrically";
ranks = nonparRanks[which(nonparRanks$SampleSize == n),];
lowerRefLowLimit = data[ranks$Lower];
lowerRefUpperLimit = data[ranks$Upper];
upperRefLowLimit = data[(n+1) - ranks$Upper];
upperRefUpperLimit = data[(n+1) - ranks$Lower];
}
}
# Calculate bootstrapped confidence intervals around limits.
if(CI == "boot" & (RI == "n" | RI == "r")){
methodCI = "Confidence Intervals calculated by bootstrapping, R = 5000";
if(RI == "n"){
bootresult = boot::boot(data = data, statistic = nonparRI, refConf = refConf, R = 5000);
}
if(RI == "r"){
bootresult = boot::boot(data = data, statistic = robust, refConf = refConf, R = 5000);
}
bootresultlower = boot::boot.ci(bootresult, conf = limitConf, type=bootStat, index = c(1,2));
bootresultupper = boot::boot.ci(bootresult, conf = limitConf, type=bootStat, index = c(2,2));
bootresultlength = length(bootresultlower[[4]]);
lowerRefLowLimit = bootresultlower[[4]][bootresultlength - 1];
lowerRefUpperLimit = bootresultlower[[4]][bootresultlength];
upperRefLowLimit = bootresultupper[[4]][bootresultlength - 1];
upperRefUpperLimit = bootresultupper[[4]][bootresultlength];
}
RVAL = list(size = n, dname = dname, out.method = out.method, out.rm = out.rm,
outliers = outliers, methodRI = methodRI, methodCI = methodCI,
norm = norm, refConf = refConf, limitConf = limitConf,
Ref_Int = c(lowerRefLimit = lowerRefLimit, upperRefLimit = upperRefLimit),
Conf_Int = c(lowerRefLowLimit = lowerRefLowLimit,
lowerRefUpperLimit = lowerRefUpperLimit,
upperRefLowLimit = upperRefLowLimit,
upperRefUpperLimit = upperRefUpperLimit));
class(RVAL) = "interval";
return(RVAL);
}
print.interval = function (x, digits = 4L, quote = TRUE, prefix = "", ...)
{
if(inherits(x[[1]], "interval")){
lapply(x, print.interval.sub);
}
else{
print.interval.sub(x);
}
}
print.interval.sub = function (x, digits = 4L, quote = TRUE, prefix = "", ...)
{
cat("\n");
cat(strwrap(x$methodRI, prefix = "\t"), sep = "\n");
cat(strwrap(x$methodCI, prefix = "\t"), sep = "\n");
cat("\n");
cat("data: ", x$dname, "\n", sep = "");
cat("N: ", x$size, "\n", sep = "");
if (!is.null(x$refConf)) {
cat(format(100 * x$refConf), "% Reference Interval", "\n", sep = "");
}
if (!is.null(x$limitConf)) {
cat(format(100 * x$limitConf), "% Confidence Intervals\n", "\n", sep = "");
}
if (!is.null(x$out.method)) {
cat("Outlier detection method:", x$out.method, "\n");
}
if (!is.null(x$outliers) & length(x$outliers) > 0) {
if(x$out.rm){
cat("Removed outliers: ");
}
else{
cat("Suspected outliers: ");
}
cat(strwrap(paste(format(x$outliers, digits = 6), collapse = ", ")), sep = "\n");
}
else {
cat("No outliers detected\n");
}
if (!is.null(x$norm)) {
cat(strwrap(x$norm, prefix = "\n"), "\n\n");
}
if (!is.null(x$Ref_Int)) {
cat("\nReference Interval: ");
cat(strwrap(paste(format(x$Ref_Int, digits = 6), collapse = ", ")), sep = "\n");
}
if (!is.null(x$Conf_Int)) {
cat("Lower Confidence Interval: ");
cat(strwrap(paste(format(x$Conf_Int[1:2], digits = 6), collapse = ", ")), sep = "\n");
cat("Upper Confidence Interval: ");
cat(strwrap(paste(format(x$Conf_Int[3:4], digits = 6), collapse = ", ")), sep = "\n");
}
cat("\n");
invisible(x);
}
plot.interval = function (x, main = NULL, ...)
{
original.parameters = par();
if(!inherits(x[[1]], "interval")){
range = max(x[["Conf_Int"]][[4]]) - min(x[["Conf_Int"]][[1]]);
y_low = min(x[["Conf_Int"]][[1]]) - 0.05 * range;
y_high = max(x[["Conf_Int"]][[4]]) + 0.05 * range;
plot.new();
plot.window(xlim=c(0,2), ylim=c(y_low,y_high));
segments(1, min(x[["Ref_Int"]]), 1, max(x[["Ref_Int"]]), col = "red");
segments(1-0.05, x[["Conf_Int"]][[1]], 1+0.05, x[["Conf_Int"]][[1]], col = "blue");
segments(1-0.05, x[["Conf_Int"]][[2]], 1+0.05, x[["Conf_Int"]][[2]], col = "blue");
segments(1-0.05, x[["Conf_Int"]][[3]], 1+0.05, x[["Conf_Int"]][[3]], col = "blue");
segments(1-0.05, x[["Conf_Int"]][[4]], 1+0.05, x[["Conf_Int"]][[4]], col = "blue");
axis(1, at=1:1, labels=x[["dname"]]);
axis(2);
if(!is.null(main)){
title(main = main);
}
else {
title(main="Reference Range");
}
title(xlab="Parameter");
title(ylab="Units");
legend("topright", col = c("red", "blue"), lty = 1, inset = c(0, -0.09),
legend = c("Reference Interval", "Confidence Interval"), cex = 0.6,
xpd = TRUE);
box();
}
if(inherits(x[[1]], "interval")){
numRanges = length(x);
intervals = unlist(sapply(x, "[", "Conf_Int"));
labels = unlist(sapply(x, "[", "dname"));
range = max(intervals) - min(intervals);
y_low = min(intervals) - 0.05 * range;
y_high = max(intervals) + 0.05 * range;
plot.new();
plot.window(xlim=c(0,numRanges + 1), ylim=c(y_low,y_high));
for(i in 1:numRanges){
segments(i, x[[i]]$Ref_Int[1], i, x[[i]]$Ref_Int[2], col = "red");
segments(i-0.05, x[[i]]$Conf_Int[[1]], i+0.05, x[[i]]$Conf_Int[[1]], col = "blue");
segments(i-0.05, x[[i]]$Conf_Int[[2]], i+0.05, x[[i]]$Conf_Int[[2]], col = "blue");
segments(i-0.05, x[[i]]$Conf_Int[[3]], i+0.05, x[[i]]$Conf_Int[[3]], col = "blue");
segments(i-0.05, x[[i]]$Conf_Int[[4]], i+0.05, x[[i]]$Conf_Int[[4]], col = "blue");
}
axis(1, at=1:numRanges, labels = FALSE);
text(x = seq(1, numRanges, by=1), par("usr")[3] - 0.2, labels = labels,
cex = 0.75, srt = 90, pos = 1, offset = 2, xpd = TRUE);
axis(2);
if(!is.null(main)){
title(main = main);
}
else {
title(main="Reference Range");
}
title(xlab="Parameter");
title(ylab="Units");
legend("topright", col = c("red", "blue"), lty = 1, inset = c(0, -0.09),
legend = c("Reference Interval", "Confidence Interval"), cex = 0.6,
xpd = TRUE);
box();
}
par(original.parameters[-c(13, 19, 21:23, 54)]);
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.