inst/pH_Fault_Interpolation.R

library(magrittr)
library(scales)
library(plyr)
library(tidyverse)
library(xts)
library(mvMonitoring)
library(pbapply)

######  Load the Data  ######
load("inst/extdata/pH_fault_data.ave")
D.Ave.q %>% str
D.Ave.q %>% head(1) # Starts 10 April, 2010 at 01:10
D.Ave.q %>% tail(1) # Ends 11 May, 2010 at 01:00
load("/Users/gabriel_odom/Box Sync/Consulting/Dr. Hering/MV_Process_Control/Kazor Code/Raw_Data from Baylor Box/raw_1min_pH_data")
d.1m %>% str
d.1m %>% head(1) # Starts 10 April, 2010 at 00:00
d.1m %>% tail(1) # Ends 11 May, 2010 at 00:00


######  Data Cleaning  #######
pH_10m_df <- data.frame(dateTime = index(D.Ave.q), D.Ave.q)
row.names(pH_10m_df) <- NULL
pH_1m_df <- d.1m
pH_1m_df %<>% tibble::rownames_to_column(var = "dateTime")
pH_1m_df %<>% mutate(dateTime = as.POSIXct(dateTime))

rm(d.1m, T, D.Ave.q)

# NaNs / NAs
# Check for NaNs (they have to be handled differently than NAs)
sapply(1:37, function(i){  # A bunch of NaNs
  sum(is.nan(pH_10m_df[,i]))
})
# Columns 11, 12, and 23 have been seriously degraded with NAs
pH_10m_df[,c(11, 12, 23)] %>% head
# mbr_x_air_pres are missing over half their observations, and mbr_air_scour
# pressure is missing over a third.
sapply(1:24, function(i){  # No NaNs
  sum(is.nan(unique(pH_1m_df[,i])))
})
# All columns (other than dateTime) have NaNs, so we replace them with NAs.
pH_10m_df[,2:ncol(pH_10m_df)] <- rapply(pH_10m_df[,2:37], function(x){
  ifelse(is.nan(x), NA, x)
},
how = "replace")

# Now NAs.
sapply(1:37, function(i){  # How many NAs?
  sum(is.na(pH_10m_df[,i]))
})

sapply(1:24, function(i){
  anyNA(pH_1m_df[,i])
})
# We have NAs in the last two columns of the 1-minute data. How many?
names(pH_1m_df[23:24])
sapply(23:24, function(i){
  sum(is.na(pH_1m_df[,i]))
})
# We don't even have these features in the NOC data. Cut them.
pH_1m_df[23:24] <- NULL
# In the 10-minute data, cut the offending columns
pH_10m_df[,c(11, 12, 23)] <- NULL


######  Graphs  #######

# Graphs
# We have to plot the features first, then determine which kind of variance
# structure to apply to the jittering errors.
obsSeq <- seq(from = 1, to = nrow(pH_1m_df), by = 10)
# Add vertical detection times
adDetect_time <- as.POSIXct("2010-04-21 13:40:00 CDT")
humanDetect_time <- as.POSIXct("2010-04-24 10:00:00 CDT")
pH_names <- sapply(names(pH_1m_df), as.name)

for(i in 2:length(pH_names)){
  xx <- ggplot(data = pH_1m_df[obsSeq,],
               aes(x = dateTime, y = eval(pH_names[[i]]))) +
    scale_y_continuous(name = eval(quote(pH_names[[i]]))) +
    scale_x_datetime(labels = date_format("%m-%d"),
                     minor_breaks = date_breaks("1 day")) +
    geom_point() +
    geom_vline(aes(xintercept = as.numeric(adDetect_time)),
               linetype = 4, colour = "blue", size = 1) +
    geom_vline(aes(xintercept = as.numeric(humanDetect_time)),
               linetype = 4, colour = "red", size = 1)
  print(xx)
}

pH_10m_names <- sapply(names(pH_10m_df), as.name)

for(i in 2:length(pH_10m_names)){
  xx <- ggplot(data = pH_10m_df,
               aes(x = dateTime, y = eval(pH_10m_names[[i]]))) +
    scale_y_continuous(name = eval(quote(pH_10m_names[[i]]))) +
    scale_x_datetime(labels = date_format("%m-%d"),
                     minor_breaks = date_breaks("1 day")) +
    geom_point() +
    geom_vline(aes(xintercept = as.numeric(adDetect_time)),
               linetype = 4, colour = "blue", size = 1) +
    geom_vline(aes(xintercept = as.numeric(humanDetect_time)),
               linetype = 4, colour = "red", size = 1)
  print(xx)
}


######  Initialise Data Frames  ######
MADs_df <- pH_10m_df %>% select(dateTime) # intialise MAD as a data.frame
pH_1min_Seq <- seq.POSIXt(from = pH_10m_df[1,1],
                          to = pH_10m_df[nrow(pH_10m_df),1],
                          by = "min")
pH_1min_Seq <- data.frame(dateTime = pH_1min_Seq)


######  Feature Error Variance Calculation  ######
# NOTE: features which end in "_1min" are the actual interpolated features.
#   features which end in "_jitter" are the actual errors for those features.

# ANOTHER NOTE: GOD DANG IT GABRIEL - you are can act like a real idiot some
#   times. Why didn't you check to see which columns were duplicated in the 10
#   minute data first, so that you wouldn't have to linearly interpolate them?
#   Bloody hell. - Gabriel, on 5th April.

### mbr_x_perm_flow
# mbr_x_perm_flow - strict interpolation
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_2_perm_flow)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()

MADs_df$mbr_2_perm_flow <- MADs_df$mbr_1_perm_flow <- 0

### mbr_x_perm_pres
# mbr_x_perm_pres - jittered interpolation; the signal thickness is about 0.25
#   units, so train a constant error variance to match
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_1_perm_pres)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
# 1
noNA <- pH_10m_df %>%
  select(dateTime, mbr_1_perm_pres) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_1_perm_pres ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.19 / sqrt(2 / pi)), ylim = c(-2,1.5))

MADs_df$mbr_1_perm_pres <- mod %>% residuals() %>% mad() / sqrt(2 / pi)

# 2
noNA <- pH_10m_df %>%
  select(dateTime, mbr_2_perm_pres) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_2_perm_pres ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.22 / sqrt(2 / pi)), ylim = c(-3,2))

MADs_df$mbr_2_perm_pres <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### mbr_x_air_flow
# mbr_x_air_flow - after looking only at the values between 5 and 10, the
#   outliers do not mask a strong underlying structure. Leave them.
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_2_air_flow)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()

MADs_df$mbr_2_air_flow <- MADs_df$mbr_1_air_flow <- pH_10m_df$mbr_1_air_flow %>%
  mad(na.rm = TRUE) / sqrt(2 / pi)


### mbr_x_tmp
# mbr_x_tmp - jittered interpolation; the signal thickness is about 0.25
#   units, so train a constant error variance to match (sd = 0.5)
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_1_tmp)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
# 1
noNA <- pH_10m_df %>%
  select(dateTime, mbr_1_tmp) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_1_tmp ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad() # This overestimates the variance a bit, try
# cutting this in half
plot(1:4300, rnorm(4300, sd = 0.18 / (2 * sqrt(2 / pi))), ylim = c(-1.5,2))

MADs_df$mbr_1_tmp <- mod %>% residuals() %>% mad() / (2 * sqrt(2 / pi))

# 2
noNA <- pH_10m_df %>%
  select(dateTime, mbr_2_tmp) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_2_tmp ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()# This overestimates the variance a bit, try
# cutting this in half
plot(1:4300, rnorm(4300, sd = 0.202 / (2 * sqrt(2 / pi))), ylim = c(-2,3))

MADs_df$mbr_2_tmp <- mod %>% residuals() %>% mad() / (2 * sqrt(2 / pi))


### ras_temp
# ras_temp - LOCF
ggplot(data = pH_10m_df, aes(x = dateTime, y = ras_temp)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
oneMin_ls <- list(ras_temp = pH_10m_df %>%
                    select(dateTime, ras_temp) %>%
                    left_join(pH_1min_Seq, .) %>%
                    fill(ras_temp, .direction = "down") %>%
                    select(ras_temp))
# the fill() function is apparently smart enough to recognise the original NAs

MADs_df$ras_temp <- 0


### bio_x_do
# bio_x_do - *very* strong signal for values < 0.25 (1) or 0.5 (2). If the
#   dissolved O2 measure is less than 0.25, perform strict linear interpolation?
#   No, LOCF (see bio_x_blow_flow), otherwise add some error with constant
#   sd = 0.1
ggplot(data = pH_10m_df[1:1440,], aes(x = dateTime, y = bio_2_do)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_hline(yintercept = 0.5) +
  geom_point()
# 1
bio_1_do_1min <- pH_10m_df %>%
  select(dateTime, bio_1_do) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_1_do, .direction = "down") %>%
  select(bio_1_do)
# Now add noise for values greater than 0.25. The observations greater than
# 0.25 could easily follow a uniform distribution
bio_1_do_sd <- pH_10m_df %>%
  filter(bio_1_do > 0.25) %>%
  select(bio_1_do) %>%
  as.matrix() %>%
  mad / sqrt(2 / pi)
MADs_df$bio_1_do <- (pH_10m_df %>%
                       select(bio_1_do) %>%
  mutate(bio_1_do = as.numeric(!is.na(bio_1_do) & bio_1_do > 0.25) *
           bio_1_do_sd / 20))[[1]]
bio_1_do_sd_df <- MADs_df %>%
  select(dateTime, bio_1_do) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_1_do, .direction = "down")

bio_1_do_1min$bio_1_do <- sapply(1:nrow(bio_1_do_1min), function(i){
  bio_1_do_1min[i,1] + rnorm(1, sd = bio_1_do_sd_df[i,2])
})

oneMin_ls$bio_1_do <- bio_1_do_1min
rm(bio_1_do_1min, bio_1_do_sd_df, bio_1_do_sd)


# 2
bio_2_do_1min <- pH_10m_df %>%
  select(dateTime, bio_2_do) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_2_do, .direction = "down") %>%
  select(bio_2_do)
# Now add noise for values greater than 0.5. The observations greater than
# 0.5 could easily follow a uniform distribution
bio_2_do_sd <- pH_10m_df %>%
  filter(bio_2_do > 0.5) %>%
  select(bio_2_do) %>%
  as.matrix() %>%
  mad / sqrt(2 / pi)
MADs_df$bio_2_do <- (pH_10m_df %>%
                       select(bio_2_do) %>%
  mutate(bio_2_do = as.numeric(!is.na(bio_2_do) & bio_2_do > 0.5) *
           (bio_2_do_sd / 20)))[[1]]
bio_2_do_sd_df <- MADs_df %>%
  select(dateTime, bio_2_do) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_2_do, .direction = "down")

bio_2_do_1min$bio_2_do <- sapply(1:nrow(bio_2_do_1min), function(i){
  bio_2_do_1min[i,1] + rnorm(1, sd = bio_2_do_sd_df[i,2])
})
oneMin_ls$bio_2_do <- bio_2_do_1min
rm(bio_2_do_1min, bio_2_do_sd_df, bio_2_do_sd)


### mbr_x_inf_flow
# mbr_x_inf_flow - jittered interpolation; fit a cubic spline, and calculate an
#   error variance from the spline residuals
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_2_inf_flow)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
# 1
noNA <- pH_10m_df %>%
  select(dateTime, mbr_1_inf_flow) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_1_inf_flow ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.25 / sqrt(2 / pi)), ylim = c(-10,5))

MADs_df$mbr_1_inf_flow <- mod %>% residuals() %>% mad() / sqrt(2 / pi)

# 2
noNA <- pH_10m_df %>%
  select(dateTime, mbr_2_inf_flow) %>%
  na.omit
mod <- smooth.spline(noNA$mbr_2_inf_flow ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.38 / sqrt(2 / pi)), ylim = c(-10,7))

MADs_df$mbr_2_inf_flow <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### mbr_x_level
# mbr_x_level - remove values less than 9.7 (except for the fault between the
#   21st and the 23rd). This will show the medium-strength signal. Notice that
#   this feature also shows evidence of a multistate process, so use LOCF
ggplot(data = pH_10m_df, aes(x = dateTime, y = mbr_2_level)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  # scale_y_continuous(limits = c(9.7, 10.1)) +
  geom_point()
# 1
naIndex1 <- which(pH_10m_df$mbr_1_level < 9.7 &
                    (pH_10m_df$dateTime < as.POSIXct("2010-04-22 14:50:00 CDT") |
                       pH_10m_df$dateTime > as.POSIXct("2010-04-22 18:10:00 CDT")))
pH_10m_df[naIndex1, "mbr_1_level"] <- NA
oneMin_ls$mbr_1_level <- pH_10m_df %>%
  select(dateTime, mbr_1_level) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(mbr_1_level, .direction = "down") %>%
  select(mbr_1_level)
rm(naIndex1)

# 2
naIndex2 <- which(pH_10m_df$mbr_2_level < 9.25 &
                    (pH_10m_df$dateTime < as.POSIXct("2010-04-22 00:00:00 CDT") |
                       pH_10m_df$dateTime > as.POSIXct("2010-04-22 18:10:00 CDT")))
pH_10m_df[naIndex2, "mbr_2_level"] <- NA
oneMin_ls$mbr_2_level <- pH_10m_df %>%
  select(dateTime, mbr_2_level) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(mbr_2_level, .direction = "down") %>%
  select(mbr_2_level)
rm(naIndex2)

MADs_df$mbr_2_level <- MADs_df$mbr_1_level <- 0


### perm_turb
# perm_turb - jittered interpolation; fit a cubic spline, and calculate an
#   error variance from the spline residuals
ggplot(data = pH_10m_df, aes(x = dateTime, y = perm_turb)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
noNA <- pH_10m_df %>%
  select(dateTime, perm_turb) %>%
  na.omit
mod <- smooth.spline(noNA$perm_turb ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.006 / sqrt(2 / pi)), ylim = c(-0.1,0.6))

MADs_df$perm_turb <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### sewage_flow
# sewage_flow - LOCF (due to the multi-state structure). Interpolation will
#   distort the underlying multi-state cycles
ggplot(data = pH_10m_df[1:576,], aes(x = dateTime, y = sewage_flow)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
oneMin_ls$sewage_flow <- pH_10m_df %>%
  select(dateTime, sewage_flow) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(sewage_flow, .direction = "down") %>%
  select(sewage_flow)

MADs_df$sewage_flow <- 0


### bio_x_blow_flow
# bio_x_blow_flow - because of the strict cyclical state membership, LOCF is
#   more appropriate than interpolation. This is curious, because I don't know
#   what is driving the state membership. This may also imply that dissolved O2
#   has a similar underlying effect.
ggplot(data = pH_10m_df[1:576,], aes(x = dateTime, y = bio_1_blow_flow)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
oneMin_ls$bio_1_blow_flow <- pH_10m_df %>%
  select(dateTime, bio_1_blow_flow) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_1_blow_flow, .direction = "down") %>%
  select(bio_1_blow_flow)
oneMin_ls$bio_2_blow_flow <- pH_10m_df %>%
  select(dateTime, bio_2_blow_flow) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_2_blow_flow, .direction = "down") %>%
  select(bio_2_blow_flow)

MADs_df$bio_2_blow_flow <- MADs_df$bio_1_blow_flow <- 0


### sewage_level
# sewage_level - jittered interpolation; fit a cubic spline, and calculate an
#   error variance from the spline residuals
ggplot(data = pH_10m_df[288:720,], aes(x = dateTime, y = sewage_level)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
noNA <- pH_10m_df %>%
  select(dateTime, sewage_level) %>%
  na.omit
mod <- smooth.spline(noNA$sewage_level ~ noNA$dateTime)
mod %>% residuals() %>% plot
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.123 / sqrt(2 / pi)), ylim = c(-2,2))

MADs_df$sewage_level <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### bio_x_level
# bio_x_level - remove values less than 4. This will show a strong but short
#   signal. Interpolate linearly and calculate an error variance from the spline
#   residuals. NOTE: splines won't work - the signal period is too short. Use
#   the "all.knots = TRUE" argument pick out the signal.
ggplot(data = pH_10m_df[1:576,], aes(x = dateTime, y = bio_2_level)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
# 1
pH_10m_df[!is.na(pH_10m_df$bio_1_level) & pH_10m_df$bio_1_level < 4, "bio_1_level"] <- NA
noNA <- pH_10m_df %>%
  select(dateTime, bio_1_level) %>%
  na.omit
mod <- smooth.spline(noNA$bio_1_level ~ noNA$dateTime, all.knots = TRUE)
mod %>% residuals() %>% plot()
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.035 / sqrt(2 / pi)), ylim = c(-0.2,0.2))

MADs_df$bio_1_level <- mod %>% residuals() %>% mad() / sqrt(2 / pi)

# 2
pH_10m_df[!is.na(pH_10m_df$bio_2_level) & pH_10m_df$bio_2_level < 4, "bio_2_level"] <- NA
noNA <- pH_10m_df %>%
  select(dateTime, bio_2_level) %>%
  na.omit
mod <- smooth.spline(noNA$bio_2_level ~ noNA$dateTime, all.knots = TRUE)
mod %>% residuals() %>% plot()
mod %>% residuals() %>% mad()
plot(1:4300, rnorm(4300, sd = 0.024 / sqrt(2 / pi)), ylim = c(-0.15,0.15))

MADs_df$bio_2_level <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### bio_x_temp
# bio_x_temp - truncate all values < 20, then LOCF; In 2010, Golden had record
#   low temperatures in the first week of May, so the 20F temperatures are real.
#   EDIT: after speaking with Dr. Hering, we can remove the values less than 58F,
#   even though they're real.
ggplot(data = pH_10m_df, aes(x = dateTime, y = bio_2_temp)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$bio_1_temp) & pH_10m_df$bio_1_temp < 58, "bio_1_temp"] <- NA
oneMin_ls$bio_1_temp <- pH_10m_df %>%
  select(dateTime, bio_1_temp) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_1_temp, .direction = "down") %>%
  select(bio_1_temp)
pH_10m_df[!is.na(pH_10m_df$bio_2_temp) & pH_10m_df$bio_2_temp < 58, "bio_2_temp"] <- NA
oneMin_ls$bio_2_temp <- pH_10m_df %>%
  select(dateTime, bio_2_temp) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(bio_2_temp, .direction = "down") %>%
  select(bio_2_temp)

MADs_df$bio_2_temp <- MADs_df$bio_1_temp <- 0


### bio_x_tss
# bio_x_tss - long signal; truncate all values < 3500, log transform, fit
#   a spline, calculate exponential (?) residuals
ggplot(data = pH_10m_df, aes(x = dateTime, y = log(bio_1_tss))) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  # geom_hline(yintercept = 3500) +
  geom_point()
# 1
pH_10m_df[!is.na(pH_10m_df$bio_1_tss) & pH_10m_df$bio_1_tss < 3500, "bio_1_tss"] <- NA
pH_10m_df$bio_1_tss_log <- pH_10m_df$bio_1_tss %>% log
noNA <- pH_10m_df %>%
  select(dateTime, bio_1_tss_log) %>%
  na.omit
mod <- smooth.spline(noNA$bio_1_tss_log ~ noNA$dateTime)
bio_1_tssResid <- mod %>% residuals()
# descdist(bio_1_tssResid)
# fitdist(bio_1_tssResid + 1,
#         "weibull", method = "mle", lower = c(0.001, 0.001),
#         start = list(scale = 1, shape = 1))
# Neither the Weibull nor the Log-Normal match the distribution of the residuals
bio_1_tssResid %>% plot()
# After a two hours of trial and error, I think this mixture distribution fits
# the residuals very well.
plot(1:4300,
     (rnorm(4300, sd = 2.6 *  0.078 / sqrt(2 / pi))) ^ 2 + rnorm(4300, sd = 0.04),
     ylim = c(-0.2,1))
n1min <- nrow(pH_1min_Seq)

bio_1_tss_log_1min_jitter <- (rnorm(n1min,
                                    sd = 2.6 *  0.078 / sqrt(2 / pi))) ^ 2 +
  rnorm(n1min, sd = 0.04)

# 2
pH_10m_df[!is.na(pH_10m_df$bio_2_tss) & pH_10m_df$bio_2_tss < 3500, "bio_2_tss"] <- NA
pH_10m_df$bio_2_tss_log <- pH_10m_df$bio_2_tss %>% log
noNA <- pH_10m_df %>%
  select(dateTime, bio_2_tss_log) %>%
  na.omit
mod <- smooth.spline(noNA$bio_2_tss_log ~ noNA$dateTime)
bio_2_tssResid <- mod %>% residuals()
bio_2_tssResid %>% plot()
plot(1:4300,
     (rnorm(4300, sd = 2.5 *  0.078 / sqrt(2 / pi))) ^ 2 +
       rnorm(4300, sd = 0.035),
     ylim = c(-0.4,0.9))

bio_2_tss_log_1min_jitter <- (rnorm(n1min,
                                    sd = 2.5 *  0.078 / sqrt(2 / pi))) ^ 2 +
  rnorm(n1min, sd = 0.035)

plot(1:n1min, bio_2_tss_log_1min_jitter, ylim = c(-0.4,0.9))
# This mixture overestimates the variance for all dates before 16 April, 2010
# at 11:30AM (index 927).
index_tss <- bio_2_tss_log_1min_jitter > 0.2
index_tss <- index_tss * c(rep(1, 9279),
                           rep(0, length(bio_2_tss_log_1min_jitter) - 9279))

bio_2_tss_log_1min_jitter[which(index_tss == 1)] <- 0
plot(bio_2_tss_log_1min_jitter)

tss_jitter <- pH_1min_Seq
tss_jitter$log_bio1 <- bio_1_tss_log_1min_jitter
tss_jitter$log_bio2 <- bio_2_tss_log_1min_jitter
rm(bio_1_tss_log_1min_jitter, bio_2_tss_log_1min_jitter, index_tss)

MADs_df$bio_2_tss_log <- MADs_df$bio_1_tss_log <- 0


### batch_volume
# batch_volume - LOCF
ggplot(data = pH_10m_df[288:1440,], aes(x = dateTime, y = batch_volume)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
oneMin_ls$batch_volume <- pH_10m_df %>%
  select(dateTime, batch_volume) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(batch_volume, .direction = "down") %>%
  select(batch_volume)

MADs_df$batch_volume <- 0


### perm_tank_level
# perm_tank_level - truncate all values < 2, then LOCF. Observations are highly
# discrete
ggplot(data = pH_10m_df, aes(x = dateTime, y = perm_tank_level)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$perm_tank_level) & pH_10m_df$perm_tank_level < 2,
          "perm_tank_level"] <- NA
oneMin_ls$perm_tank_level <- pH_10m_df %>%
  select(dateTime, perm_tank_level) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(perm_tank_level, .direction = "down") %>%
  select(perm_tank_level)

MADs_df$perm_tank_level <- 0


### ras_do
# ras_do - LOCF
ggplot(data = pH_10m_df, aes(x = dateTime, y = ras_do)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
oneMin_ls$ras_do <- pH_10m_df %>%
  select(dateTime, ras_do) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(ras_do, .direction = "down") %>%
  select(ras_do)

MADs_df$ras_do <- 0


### ras_ph
# ras_ph - truncate all values < 5.4, then LOCF
ggplot(data = pH_10m_df, aes(x = dateTime, y = ras_ph)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$ras_ph) &
            (pH_10m_df$ras_ph < 5.4 | pH_10m_df$ras_ph > 9),
          "ras_ph"] <- NA
oneMin_ls$ras_ph <- pH_10m_df %>%
  select(dateTime, ras_ph) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(ras_ph, .direction = "down") %>%
  select(ras_ph)

MADs_df$ras_ph <- 0


### ras_tss
# ras_tss - long signal; truncate all values < 3500, log transform, fit a
#   spline, calculate day-blocked variances, then fit a (linear?) model to the
#   block-specific variances. NO. After spline fitting the logged values, we
#   only see evidence of two variance structures: before and after 16 April,
#   2010, at 11:30AM (index 927 (10 min) / 9279 (1 min)).
ggplot(data = pH_10m_df, aes(x = dateTime, y = ras_tss)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  # geom_hline(yintercept = 3500) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$ras_tss) & pH_10m_df$ras_tss < 3500, "ras_tss"] <- NA
pH_10m_df$ras_tss_log <- pH_10m_df$ras_tss %>% log
noNA <- pH_10m_df %>%
  select(dateTime, ras_tss_log) %>%
  na.omit
mod <- smooth.spline(noNA$ras_tss_log ~ noNA$dateTime)
ras_tssResid <- mod %>% residuals()
ras_tssResid[1:927] %>% plot(ylim = c(-1, 0.5))
ras_tssResid[1:927] %>% mad
plot(1:927, rnorm(927, sd = 0.186 / (1.5 * sqrt(2 / pi))), ylim = c(-1,0.5))
ras_tssResid[928:4456] %>% plot(ylim = c(-1, 0.5))
ras_tssResid[928:4456] %>% mad
plot(928:4456, rnorm(4456 - 928 + 1, sd = 0.053 / sqrt(2 / pi)), ylim = c(-1,0.5))
# Now put them together
ras_tss_1min_jitter <- c(rnorm(9279, sd = 0.186 / (1.5 * sqrt(2 / pi))),
                         rnorm(n1min - 9279, sd = 0.053 / sqrt(2 / pi)))
ras_tssResid %>% plot(ylim = c(-1, 0.5))
ras_tss_1min_jitter %>% plot(ylim = c(-1, 0.5))
tss_jitter$log_ras <- ras_tss_1min_jitter
rm(ras_tss_1min_jitter)

MADs_df$ras_tss_log <- 0


### perm_cond
# perm_cond - truncate all values < 625, then linearly interpolate with error
#   variances from the residuals of a cubic spline fit. Clearly defined and
#   strong signal
ggplot(data = pH_10m_df, aes(x = dateTime, y = perm_cond)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  # geom_hline(yintercept = 625) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$perm_cond) & pH_10m_df$perm_cond < 625, "perm_cond"] <- NA
noNA <- pH_10m_df %>%
  select(dateTime, perm_cond) %>%
  na.omit
mod <- smooth.spline(noNA$perm_cond ~ noNA$dateTime)
mod %>% residuals() %>% plot(ylim = c(-50, 50))
mod %>% residuals() %>% mad
plot(1:4300, rnorm(4300, sd = 1.560 / sqrt(2 / pi)), ylim = c(-50,50))

MADs_df$perm_cond <- mod %>% residuals() %>% mad() / sqrt(2 / pi)


### ambient_temp
# ambient_temp - truncate all values < 30, the strict linear interpolation. See
#   bio_x_temp for more.
ggplot(data = pH_10m_df, aes(x = dateTime, y = ambient_temp)) +
  scale_x_datetime(labels = date_format("%m-%d"),
                   minor_breaks = date_breaks("1 day")) +
  geom_point()
pH_10m_df[!is.na(pH_10m_df$ambient_temp) &
            pH_10m_df$ambient_temp < 30,
          "ambient_temp"] <- NA
oneMin_ls$ambient_temp <- pH_10m_df %>%
  select(dateTime, ambient_temp) %>%
  left_join(pH_1min_Seq, .) %>%
  fill(ambient_temp, .direction = "down") %>%
  select(ambient_temp)

MADs_df$ambient_temp <- 0

######  The jitterInterpolate() Function  ######
# # We need to modify the way the jitterInterpolate function handles features with
# # variance equal to 0. The MASS::mvrnorm function can't.
# testMASS <- mvrnorm(n = 900, mu = rep(0, 34),
#         Sigma = diag((1 / 1:34) ^ 2, ncol = 34, nrow = 34))
# testBase <- sapply(rep(c(1,0),5), function(sigma){
#   rnorm(n = 5, mean = 0, sd = sigma)
# })
# testMASS[,6] %>% plot(ylim = c(-1, 1))
# testBase[,6] %>% plot(ylim = c(-1, 1))

jitterInterpolate <- function(data, SDs_df, index, numInterpol){
  # data = a time series data frame with the first column as the time index
  # SDs_df = a data frame of standard deviations. has the same time index as
  # data
  # index = we interpolate values between index and index + 1
  # numInterpol = how many values will we interpolate
  require(dplyr)

  p <- data %>% ncol() - 1
  data_names <- colnames(data) %>% sapply(as.name)

  mads_vec <- sapply(2:(p + 1), function(i){
    SDs_df[index,] %>% select(eval(data_names[[i]])) %>% as.numeric()
  })

  # Generate the jitter values
  # The MASS::mvrnorm function requires a PD covariance matrix, which does not
  #   allow for feature variance to equal 0. For features under strict linear
  #   interpolation or LOCF, we need to generate "errors" form a degenerate
  #   distribution, so the jitter is constantly 0.
  jitter_mat <- sapply(mads_vec, function(sigma){
    rnorm(n = numInterpol, mean = 0, sd = sigma)
  })
  # jitter_mat <- mvrnorm(n = numInterpol,
  #                       mu = rep(0, p),
  #                       Sigma = diag(mads_vec ^ 2, ncol = p, nrow = p))
  # The columns of the jitter matrix are the error values to add to the
  # interpolated values between index and index + 1. Our covariance matrix is
  # diagonal because we assume local independence of the features.

  # Now for the interpolation. I couldn't find an easy way to interpolate
  # between two vectors, so we're going to bind some scalar interpolations
  # together
  interpol_mat <- matrix(0, nrow = numInterpol, ncol = p)
  for(i in 1:p){
    # If we have 1 NA, we'll hit this if statement
    if(anyNA(data[index:(index + 1), i + 1])){
      # Now that we know one of the values are NA, are they both?
      if(all(is.na(data[index:(index + 1), i + 1]))){
        # If all values are NA, then the values between should be NA too
        interpol_mat[,i] <- rep(NA, numInterpol)
      }else{
        # If they are not both NA, then impute a constant between
        interpol_mat[,i] <- approx(data[index:(index + 1), i + 1],
                                   method = "constant",
                                   n = numInterpol)$y
      }
    }else{
      interpol_mat[,i] <- approx(data[index:(index + 1), i + 1],
                                 method = "linear",
                                 n = numInterpol)$y
    }
  }


  # We now have interpolated values and a matrix of errors of the same size,
  # let's add them together
  drift_mat <- interpol_mat + jitter_mat
  # Some of the values don't make real sense - it's possible, for instance, to
  # generate negative values from the white noise distribution. Because of the
  # physical design of the system, negative values (for the most part) are
  # impossible. We will have to truncate these values after generation.
  drift_mat[drift_mat < 0] <- 0

  # We can use the seq.POSIXt function to create a sequences of times between
  # time index and time index + 1
  dateTime <- seq.POSIXt(from = data[index,1],
                         to = data[(index + 1),1],
                         length.out = numInterpol + 2)[2:(numInterpol + 1)]
  dateTime <- data.frame(dateTime = dateTime)

  # We can now bind the POSIX information to the drift matrix
  interpolated_df <- as.data.frame(drift_mat)
  interpolated_df <- bind_cols(dateTime, interpolated_df)
  names(interpolated_df) <- names(data)
  bind_rows(data[index,], interpolated_df)
}


######  Downscaling  ######
# Now that we have the MADs data frame (which is technically the standard
# deviations, not the MADs), and the feature-specific data frames for the LOCF
# interpolation, we can put these all together with the jitterInterpolate()
# function from Data_Cleaning_and_Interpolation_Summary.Rmd
names(pH_10m_df) %in% names(MADs_df)
ncol(pH_10m_df)

pH_10to1_df <- pblapply(1:(nrow(pH_10m_df) - 1),
                        function(i) {
                          jitterInterpolate(data = pH_10m_df %>%
                                              select(-bio_1_tss,
                                                     -bio_2_tss,
                                                     -ras_tss),
                                            SDs_df = MADs_df,
                                            index = i,
                                            numInterpol = 9)
                          }) %>% ldply
pH_10to1_df <- bind_rows(pH_10to1_df, pH_10m_df[4464,])

# Add in the hard-coded jitters (for the three tss variables)
pH_10to1_df$bio_1_tss_log <- pH_10to1_df$bio_1_tss_log +
  tss_jitter$log_bio1
pH_10to1_df$bio_2_tss_log <- pH_10to1_df$bio_2_tss_log +
  tss_jitter$log_bio2
# There is a hard ceiling on ras_tss, so we need to replicate that. Replace all
# values greater than 9.9 (on the log scale) with 9.9.
pH_10to1_df$ras_tss_log <- pH_10to1_df$ras_tss_log +
  tss_jitter$log_ras
pH_10to1_df$ras_tss_log[!is.na(pH_10to1_df$ras_tss_log) &
                          pH_10to1_df$ras_tss_log > 9.9] <- 9.9

# Let's visually check this interpolation
names(pH_10m_df)
ggplot(data = pH_10m_df,
       aes(x = dateTime, y = ras_tss_log)) +
  geom_point()
# If we used jittering, pull the feature from here
ggplot(data = pH_10to1_df,
       aes(x = dateTime, y = ras_tss_log)) +
  geom_point(alpha = 0.1)
# If we created the feature specifically, look at it here. Features created are:
#   ras_temp_1min, bio_1_do_1min, bio_2_do_1min, mbr_1_level, mbr_2_level,
#   sewage_flow_1min, bio_1_blow_flow_1min, bio_2_blow_flow_1min, bio_1_temp_1min,
#   bio_2_temp_1min, perm_tank_level_1min, ras_do_1min, ras_ph_1min, and
#   ambent_temp_1min
# Overall, I think we're downscaling very well.

######  Combine the Interpolated Data Frames  ######
oneMin_df <- oneMin_ls %>% bind_cols()
rm(oneMin_ls)

pH_inter_df <- pH_10to1_df %>% left_join(pH_1min_Seq, .)
pH_inter_df$ambient_temp <- oneMin_df$ambient_temp
pH_inter_df$batch_volume <- oneMin_df$batch_volume
pH_inter_df$bio_1_blow_flow <- oneMin_df$bio_1_blow_flow
pH_inter_df$bio_1_do <- oneMin_df$bio_1_do
pH_inter_df$bio_1_temp <- oneMin_df$bio_1_temp
pH_inter_df$bio_2_blow_flow <- oneMin_df$bio_2_blow_flow
pH_inter_df$bio_2_do <- oneMin_df$bio_2_do
pH_inter_df$bio_2_temp <- oneMin_df$bio_2_temp
pH_inter_df$mbr_1_level <- oneMin_df$mbr_1_level
pH_inter_df$mbr_2_level <- oneMin_df$mbr_2_level
pH_inter_df$perm_tank_level <- oneMin_df$perm_tank_level
pH_inter_df$ras_do <- oneMin_df$ras_do
pH_inter_df$ras_ph <- oneMin_df$ras_ph
pH_inter_df$ras_temp <- oneMin_df$ras_temp
pH_inter_df$sewage_flow <- oneMin_df$sewage_flow
pH_inter_df$bio_1_tss <- NULL
pH_inter_df$bio_2_tss <- NULL
pH_inter_df$ras_tss <- NULL

#######  Save the 10-minute Data  ######
write_csv(pH_inter_df, path = "pH_Fault_interpolated_20170405.csv")

#######  Join the Data Frames  ######
pH_1m_df %>% str
pH_inter_df %>% str

# We need to round the POSIX column to the nearest minute.
pH_1m_df$dateTime %>% head
pH_1m_df$dateTime %>% round.POSIXt(units = "mins") %>% head
pH_1m_df$dateTime_round <- round.POSIXt(pH_1m_df$dateTime, units = "mins") %>%
  as.POSIXct() # round the times to the nearest minute. We now have 36 rows with
# duplicate time stamps
# Aggregate the duplicate rows by applying the "mean()" function with the
# numcolwise() call. This takes about 95 seconds
pH_1m_round_df <- ddply(pH_1m_df, "dateTime_round", numcolwise(mean))
# Rename the date-time column so we can join the two data frames
pH_1m_round_df %<>% rename(dateTime = dateTime_round)

# Remove comlumns in pH_inter_df which are duplicates of columns in pH_1m_df
names(pH_inter_df)
names(pH_1m_df)
pH_inter_clean_df <- pH_inter_df %>% select(-bio_1_do, -bio_2_do,
                                            -mbr_1_level, -mbr_2_level,
                                            -mbr_1_tmp, -mbr_2_tmp,
                                            -perm_turb, -sewage_flow,
                                            -bio_1_blow_flow, -bio_2_blow_flow,
                                            -mbr_1_perm_flow, -mbr_2_perm_flow,
                                            -mbr_1_inf_flow, -mbr_2_inf_flow)

# Full outer join the data frames
pH_Fault_df <- full_join(pH_1m_round_df, pH_inter_clean_df, by = "dateTime")

# Inner join the data frames
pH_Fault_Inner_df <- inner_join(pH_1m_round_df,
                                pH_inter_clean_df,
                                by = "dateTime")

######  Save the Joined Data  ######
write_csv(pH_Fault_Inner_df, path = "pH_Fault_InnerJoin_20170405.csv")
write_csv(pH_Fault_df, path = "pH_Fault_OuterJoin_20170405.csv")

pH_Fault_Inner_df <- read_csv("C:/Users/gabriel_odom/Box Sync/Consulting/Dr. Hering/MV_Process_Control/Kazor Code/pH_Fault_InnerJoin_20170405.csv")

# Missing Data
sum(is.na(pH_Fault_Inner_df$sewage_level))
sum(is.na(pH_Fault_Inner_df$mbr_1_perm_pres))
sum(is.na(pH_Fault_Inner_df$mbr_2_perm_pres))
sum(is.na(pH_Fault_Inner_df$mbr_1_air_flow))
sum(is.na(pH_Fault_Inner_df$mbr_2_air_flow))

pH_df <- na.omit(pH_Fault_Inner_df)

######  Graphs Part 2  ######
adDetect_time <- as.POSIXct("2010-04-21 13:40:00 CDT")
humanDetect_time <- as.POSIXct("2010-04-24 10:00:00 CDT")
pH_names_full <- sapply(names(pH_df), as.name)

for(i in 2:length(pH_names_full)){
  xx <- ggplot(data = pH_df,
               aes(x = dateTime, y = eval(pH_names_full[[i]]))) +
    scale_y_continuous(name = eval(quote(pH_names_full[[i]]))) +
    scale_x_datetime(labels = date_format("%m-%d"),
                     minor_breaks = date_breaks("1 day")) +
    geom_point(alpha = 0.1) +
    geom_vline(aes(xintercept = as.numeric(adDetect_time)),
               linetype = 4, colour = "blue", size = 1) +
    geom_vline(aes(xintercept = as.numeric(humanDetect_time)),
               linetype = 4, colour = "red", size = 1)
  print(xx)
}

# Based on these graphs, I really think we need to start training the model on
# the 16th of April, not the 10th (when the data starts).

Try the mvMonitoring package in your browser

Any scripts or data that you put into this service are public.

mvMonitoring documentation built on Nov. 22, 2023, 1:09 a.m.