In queelius/masked.data: Estimating Reliability of Components in Series from Masked Data with Relaxed Candidate Set Models

knitr::opts_chunk$set(echo = TRUE)

library(algebraic.mle)
library(dplyr)
library(ggplot2)
library(reshape2)
library(knitr)
library(md.tools)
library(md.series.system)
library(utils)
library(tidyverse)
library(gridExtra)

# this is the actual code i ran to generate the data set for
# `exp_experiment_4.csv`. this was done before i generalized the approach.
# i'm keeping all of the code the same since this is actually what was run
# to generate the data set, but it should be the same as what i posted
# below in block `run-exp-experiment-4`

generate_data <- function(n, theta, tau, p) {
    m <- length(theta)
    comp_times <- matrix(nrow=n,ncol=m)
    for (j in 1:m)
        comp_times[,j] <- rexp(n,theta[j])
    comp_times <- md_encode_matrix(comp_times,"t")

    comp_times %>%
        md_series_lifetime_right_censoring(tau) %>%
        md_bernoulli_cand_C1_C2_C3(p) %>%
        md_cand_sampler()
}

custom_solver <- function(data, theta, extra_info = NULL, annealing = TRUE) {
    ll <- md_loglike_exp_series_C1_C2_C3(data)
    ll.grad <- md_score_exp_series_C1_C2_C3(data)
    fish <- md_fim_exp_series_C1_C2_C3(data)
    ll.hess <- function(x) -fish(x)
    theta.hat <- NULL
    start <- NULL
    m <- length(theta)

    tryCatch({
        start <- list(par = theta)
        if (annealing) {
            start <- sim_anneal(par = theta, fn = ll, control =
                list(fnscale = -1, maxit = 100000L, trace = FALSE,
                    t_init = 20, 1e-3, alpha = 0.95,
                    it_per_temp = 50L,
                    neigh = function(par, temp, ...) {
                        tt <- min(temp, 1)
                        par + rnorm(m, 0, tt)
                    },
                    sup = function(x) all(x > 0)))
        }
        res_optim <- optim(
            par = start$par,
            fn = ll,
            gr = ll.grad,
            method = "L-BFGS-B",
            lower = 1e-30,
            hessian = FALSE,
            control = list(fnscale = -1, maxit = 1000L))
        theta.hat <- mle_numerical(res_optim,
            options = list(hessian = ll.hess(res_optim$par)))
    }, error = function(e) {
        cat("Sample size", nrow(data), " | Anneal:",
            start$par, " | ", e$message)
        if (!is.null(extra_info)) {
            cat(" | ", extra_info)
        }
        cat("\n")
    })
    theta.hat
}

exp_experiment_4_gen <- function(
    csv_filename,
    R = 1000,
    bernoulli_probs = c(.2, .3, .4, .5, .6, .7, .8, .9),
    q = .25,
    sample_size = 100,
    append = TRUE,
    use_aneal_start = TRUE) {

    set.seed(32861) # set seed for reproducibility
    theta <- c(1,   # component 1 failure rate
            1.1,    # 2
            0.98,   # 3
            1.12,   # 4
            1.05)   # 5

    m <- length(theta)
    tau <- -(1/sum(theta))*log(q)

    if (!append) {
        cnames <- c("R", "p", "tau", "N", paste0("bias",1:m), "mse",
            paste0("se",1:m), paste0("se_asym",1:m), "mse_asym", "mse_asym_hat",
            paste0("coverage",1:m))##

        # Write column names first
        write.table(t(cnames), file = csv_filename,
            sep = ",", col.names = FALSE,
            row.names = FALSE, append = FALSE, quote = FALSE)
    }

    for (i in 1:length(bernoulli_probs)) {
        p <- bernoulli_probs[i]
        cat("Starting simulations for Bernoulli probability", p, "\n")
        mles <- matrix(nrow = R, ncol = m)

        # For storing the lower and upper bounds of the confidence intervals
        CI_lwr <- matrix(nrow = R, ncol = m)
        CI_upr <- matrix(nrow = R, ncol = m)

        j <- 1L
        repeat {
            data <- generate_data(sample_size, theta, tau, p)
            theta.hat <- custom_solver(
                data = data,
                theta = theta,
                extra_info = paste0("Replicate(", j, ")"),
                annealing = use_aneal_start)
            if (is.null(theta.hat)) {
                next
            }
            if (j %% 10 == 0) {
                cat("Sample size", sample_size, " | Replicate ", j,
                    " | MLE ", point(theta.hat), "\n")
            }
            mles[j, ] <- point(theta.hat)

            CI <- confint(theta.hat)
            if (any(is.nan(CI)))
            {
                print("NaN in CI")
                print(summary(theta.hat))
            }
            CI_lwr[j, ] <- CI[,1]
            CI_upr[j, ] <- CI[,2]

            j <- j + 1L
            if (j > R) {
                break
            }
        }

        # compute asymptotics
        theta.mle <- NULL
        while (is.null(theta.mle)) {
            data <- generate_data(sample_size, theta, tau, p)
            theta.mle <- custom_solver(
                data = data,
                theta = theta,
                extra_info = "asymptotics",
                annealing = use_aneal_start)
        }
        SE.asym <- se(theta.mle)
        MSE.asym <- mse(theta.hat, theta)
        MSE.asym.hat <- mse(theta.mle)

        # Calculate bias, MSE, and SE for each parameter
        bias <- colMeans(mles) - theta
        MSE <- sum(colMeans((mles - theta)^2))
        sigma <- cov(mles) * ((sample_size - 1) / sample_size)
        SE <- sqrt(diag(sigma))

        # Compute coverage probabilities
        coverage <- colMeans((CI_lwr <= theta) & (theta <= CI_upr))

        datum <- c(R, p, tau, sample_size, bias, MSE, SE, SE.asym,
            MSE.asym, MSE.asym.hat, coverage)

        write.table(t(datum), file = csv_filename, sep = ",", col.names = FALSE,
            row.names = FALSE, append = TRUE)
    }
}

exp_experiment_4_gen(
    csv_filename = "exp_experiment_4.csv",
    append = TRUE,
    use_aneal_start = TRUE)

The purpose of this data set is to analyze the sensivity of the exponential series system (5 components, all roughly the same failure rate) with respect to a change in the Bernoulli probabilities for moderately sized sample sizes.

Here is how we generated this data set (we do not evaluate this code, since we already generated the data set and saved it to a file):

source("exp_experiment_gen.R")
exp_experiment_gen(
    R = 1000,
    bernoulli_probs = c(.2, .3, .4, .5, .6, .7, .8, .9),
    quants = .25,
    sample_sizes = 100,
    theta = c(1, 1.1, 0.98, 1.12, 1.05),
    seed = 32861,
    use_aneal_start = TRUE,
    append = FALSE,
    csv_filename = "exp_experiment_4.csv")

We read the data set from the file with:

df <- read.csv("exp_experiment_4.csv")

kable(df[1:5, 1:10])
kable(df[1:5, 11:20])
kable(df[1:5, 21:ncol(df)])

Coverage Probabilities

Let's visualize the coverage probabilities:

###################### coverage probability ##################
df_long <- df %>% pivot_longer(
    cols = starts_with("coverage"),
    names_to = "Coverage",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$Coverage <- as.factor(df_long$Coverage)

# Plot the coverage probabilities, with a solid line for the
# 95% confidence level
ggplot(df_long, aes(x = p, y = Value, color = Coverage)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = .95, linetype = "dashed") +
  labs(x = "Bernoulli Probability (p)", y = "Coverage Probability",
    title = "Coverage Probability vs Bernoulli Probability",
    color = "Coverage")

Analysis

Analysis here

Bias

df_long <- df %>% pivot_longer(
    cols = starts_with("bias"),
    names_to = "Bias",
    values_to = "Value")

# Convert the "Bias" column to a factor for better plotting
df_long$Bias <- as.factor(df_long$Bias)

# Plot,
ggplot(df_long, aes(x = p, y = Value, color = Bias)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "Bias",
       title = "Bias vs Bernoulli Pobability", color = "Bias")

Analysis

Analysis here

MSE

ggplot(df) +
  geom_point(aes(x = p, y = mse)) +
  geom_line(aes(x = p, y = mse, color = "Simulation")) +
  geom_line(aes(x = p, y = mse_asym,
                color = "Asymptotic"), linetype="dashed") +
  labs(x = "Bernoulli Probability (p)", y = "Mean Squared Error",
       title = "Mean Squared Error vs Bernoulli Probability")

Analysis

Analysis here

SE

df_long <- df %>% pivot_longer(
    # regex to match column names "se<digit>"
    cols = matches("se[0-9]"),
    names_to = "SE",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$SE <- as.factor(df_long$SE)

# Plot
ggplot(df_long, aes(x = p, y = Value, color = SE)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "SE",
       title = "SE vs Bernoulli Probability", color = "SE")

Analysis

Analysis here

SE asymptotics

df_long <- df %>% pivot_longer(
    # regex to match column names "se<digit>"
    cols = starts_with("se_asym"),
    names_to = "SE",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$SE <- as.factor(df_long$SE)

# Plot
ggplot(df_long, aes(x = p, y = Value, color = SE)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "SE",
       title = "SE vs Bernoulli Probability", color = "SE")