# Set the default chunk option `echo = TRUE` for the chunks in this document.
knitr::opts_chunk$set(echo = TRUE)

Get data for range and outliers

This is an R Markdown document. The document takes input from a SHARKweb filtered download of national marine environmental data to calculate and plot range and outlier data.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document.

Information on outliers

Get the lower and upper quartiles and the IQR (interquartile range) using:

    lowerq = quantile(data)[2]
    upperq = quantile(data)[4]
    iqr = upperq - lowerq  # or use IQR(data)

Compute the bounds for a mild outlier:

    mild.threshold.upper = (iqr * 1.5) + upperq
    mild.threshold.lower = lowerq - (iqr * 1.5)

Any data point outside these values (> mild.threshold.upper or < mild.threshold.lower) is a mild outlier.

To detect extreme outliers do the same, but multiply by 3 instead:

    extreme.threshold.upper = (iqr * 3) + upperq
    extreme.threshold.lower = lowerq - (iqr * 3)

Any data point outside these values (> extreme.threshold.upper or < extreme.threshold.lower) is an extreme outlier.

Tukey's fences (https://en.wikipedia.org/wiki/Outlier#Tukey.27s_test): Other methods flag observations based on measures such as the interquartile range. For example, if $Q_1$ and $Q_3$ are the lower and upper quartiles respectively, then one could define an outlier to be any observation outside the range:

$$\big[\,Q_1 - k(Q_3 - Q_1),\; Q_3 + k(Q_3 - Q_1)\,\big]$$

for some nonnegative constant $k$. John Tukey proposed this test, where $k = 1.5$ indicates an "outlier", and $k = 3$ indicates data that is "far out".

library(tidyverse)
# Folder holding the SHARKweb export. The full path is built with file.path()
# instead of calling setwd(), so the working directory of the session (and of
# any later chunk) is left untouched.
# NOTE(review): machine-specific path - adjust to where the export is stored.
data_dir <- "C:/__R/_test_SHARK4R"

# Read the tab-delimited export. The locale declares latin1 encoding and a
# decimal comma; column types are guessed from the first 2000 rows.
readfile <- read_delim(
  file.path(data_dir, "sharkweb_data_20201029.txt"),
  delim = "\t",
  guess_max = 2000,
  col_names = TRUE,
  locale = readr::locale(encoding = "latin1", decimal_mark = ",")
)

# Quick look at the raw import: column names, number of distinct parameters
# and the datatypes present.
colnames(readfile)
length(unique(readfile$Parameter))

unique(readfile$Datatyp)

# Re-guess the type of every column from its values. as.is = TRUE keeps text
# columns as character directly, so no factor -> character round trip is
# needed, and it avoids the warning recent R versions raise when `as.is` is
# left unspecified.
readfile <- readfile %>%
  mutate(across(everything(), ~ type.convert(.x, as.is = TRUE)))

unique(readfile$Parameter)

# Drop rows without a measurement value (`Mätvärde`); the range and quantile
# calculations further down do not pass na.rm.
readfile <- readfile %>%
  drop_na("Mätvärde")
# Range and Tukey-fence outlier summary per datatype ----
#
# For every datatype listed below, and for every parameter reported under it,
# print the observed range, the quantiles and the mild/extreme Tukey fences,
# then draw a boxplot with the two upper fences overlaid.

# Datatypes to summarise, in the order they are reported.
datatypes <- c(
  "Bacterioplankton", "Chlorophyll", "Picoplankton", "Zooplankton",
  "Phytoplankton", "Primary production", "Seal pathology", "Epibenthos",
  "Harbour seal", "Grey seal", "Zoobenthos", "Ringed seal", "Sedimentation",
  "Harbour Porpoise"
)

# Print the range, quantiles and Tukey fences of one parameter and draw its
# boxplot.
#
# values:    measurement values (`Mätvärde`) belonging to one parameter
# parameter: name of that parameter; used in the printed table, as the
#            x position of the box and as the plot title
#
# Returns the ggplot object invisibly (it has already been printed).
summarise_parameter <- function(values, parameter) {
  print(range(values))

  # Compute the quantiles once and reuse them for both quartiles.
  quartiles <- quantile(values)
  print(quartiles)

  # unname() drops the "25%"/"75%" labels so they do not end up as a
  # misleading row name in the printed threshold table.
  lowerq <- unname(quartiles[2])
  upperq <- unname(quartiles[4])
  iqr <- upperq - lowerq

  # Tukey fences: 1.5 * IQR beyond the quartiles for mild outliers,
  # 3 * IQR for extreme outliers.
  mild_upper <- (iqr * 1.5) + upperq
  mild_lower <- lowerq - (iqr * 1.5)
  extreme_upper <- (iqr * 3) + upperq
  extreme_lower <- lowerq - (iqr * 3)

  print(data.frame(
    PARAMETER = parameter,
    mild.threshold.upper = mild_upper,
    mild.threshold.lower = mild_lower,
    extreme.threshold.upper = extreme_upper,
    extreme.threshold.lower = extreme_lower
  ))

  # Only the upper fences are drawn: yellow = mild, red = extreme.
  p <- ggplot(tibble(Mätvärde = values), aes(y = Mätvärde, x = parameter)) +
    geom_boxplot(outlier.colour = "red") +
    geom_hline(yintercept = mild_upper, colour = "yellow") +
    geom_hline(yintercept = extreme_upper, colour = "red") +
    labs(x = "Parameter") +
    ggtitle(label = parameter) +
    theme_bw()
  print(p)

  invisible(p)
}

for (datatype_name in datatypes) {
  datatype <- readfile %>%
    filter(Datatyp == datatype_name) %>%
    select(Parameter, Mätvärde)
  message(datatype_name)

  # A datatype with no rows yields no parameters, so nothing is printed for it
  # beyond the message above.
  for (parameter in unique(datatype$Parameter)) {
    summarise_parameter(
      datatype$Mätvärde[which(datatype$Parameter == parameter)],
      parameter
    )
  }
}

Reproducibility

# Date and time at which this document was run
Sys.time()

# Print the session info reported by sessioninfo::session_info() for this run
sessioninfo::session_info()


sharksmhi/SHARK4R documentation built on Jan. 9, 2025, 5:15 p.m.