In queelius/masked.data: Estimating Reliability of Components in Series from Masked Data with Relaxed Candidate Set Models

knitr::opts_chunk$set(echo = TRUE)

library(algebraic.mle)
library(dplyr)
library(ggplot2)
library(reshape2)
library(knitr)
library(md.tools)
library(md.series.system)
library(utils)
library(tidyverse)
library(gridExtra)

# this is the actual code i ran to generate the data set for
# `exp_experiment_5.csv`. this was done before i generalized the approach.
# i'm keeping all of the code the same since this is actually what was run
# to generate the data set, but it should be the same as what i posted
# below in block `run-exp-experiment-5`

generate_data <- function(n, theta, tau, p) {
    m <- length(theta)
    comp_times <- matrix(nrow=n,ncol=m)
    for (j in 1:m)
        comp_times[,j] <- rexp(n,theta[j])
    comp_times <- md_encode_matrix(comp_times,"t")

    comp_times %>%
        md_series_lifetime_right_censoring(tau) %>%
        md_bernoulli_cand_C1_C2_C3(p) %>%
        md_cand_sampler()
}

custom_solver <- function(data, theta, extra_info = NULL, annealing = TRUE) {
    ll <- md_loglike_exp_series_C1_C2_C3(data)
    ll.grad <- md_score_exp_series_C1_C2_C3(data)
    fish <- md_fim_exp_series_C1_C2_C3(data)
    ll.hess <- function(x) -fish(x)
    theta.hat <- NULL
    start <- NULL
    m <- length(theta)

    tryCatch({
        start <- list(par = theta)
        if (annealing) {
            start <- sim_anneal(par = theta, fn = ll, control =
                list(fnscale = -1, maxit = 10000L, trace = FALSE,
                    t_init = 1, 1e-2, alpha = 0.9,
                    it_per_temp = 10L,
                    sup = function(x) all(x > 0)))
        }
        res_optim <- optim(
            par = start$par,
            fn = ll,
            gr = ll.grad,
            method = "L-BFGS-B",
            lower = 1e-30,
            hessian = FALSE,
            control = list(fnscale = -1, maxit = 1000L))
        theta.hat <- mle_numerical(res_optim,
            options = list(hessian = ll.hess(res_optim$par)))
    }, error = function(e) {
        cat("Sample size", nrow(data), " | Anneal:",
            start$par, " | ", e$message)
        if (!is.null(extra_info)) {
            cat(" | ", extra_info)
        }
        cat("\n")
    })
    theta.hat
}

exp_experiment_5_gen <- function(
    csv_filename,
    R = 999,
    bernoulli_probs = c(.2, .3, .4, .5),
    quants = c(.1, .2, .3, .4, .5),
    sample_size = c(25, 50, 100, 200, 400),
    append = TRUE,
    use_aneal_start = TRUE) {

    set.seed(32861) # set seed for reproducibility
    theta <- c(1,   # component 1 failure rate
            1.1,    # 2
            0.98,   # 3
            1.12,   # 4
            1.05)   # 5

    m <- length(theta)

    if (!append) {
        cnames <- c("R", "p", "tau", "q", "N", paste0("bias",1:m), "mse",
            paste0("se",1:m), paste0("se_asym",1:m), "mse_asym", "mse_asym_hat",
            paste0("coverage",1:m))

        # Write column names first
        write.table(t(cnames), file = csv_filename,
            sep = ",", col.names = FALSE,
            row.names = FALSE, append = FALSE, quote = FALSE)
    }

    for (i in 1:length(bernoulli_probs)) {
        p <- bernoulli_probs[i]
        cat("Starting simulations for Bernoulli probability", p, "\n")

        for (j in 1:length(sample_size)) {
            N <- sample_size[j]
            cat("Starting simulations for sample size", N, "\n")

            for (k in 1:length(quants)) {
                q <- quants[k]
                cat("Starting simulations for quantile", q, "\n")
                tau <- -(1/sum(theta))*log(q)
                mles <- matrix(nrow = R, ncol = m)
                CI_lwr <- matrix(nrow = R, ncol = m)
                CI_upr <- matrix(nrow = R, ncol = m)

                j <- 1L
                repeat {
                    data <- generate_data(N, theta, tau, p)
                    theta.hat <- custom_solver(
                        data = data,
                        theta = theta,
                        extra_info = paste0("Replicate(", j, ")"),
                        annealing = use_aneal_start)
                    if (is.null(theta.hat)) {
                        next
                    }
                    if (j %% 10 == 0) {
                        cat("Sample size", N, " | Replicate ", j,
                            " | MLE ", point(theta.hat), "\n")
                    }
                    mles[j, ] <- point(theta.hat)

                    CI <- confint(theta.hat)
                    if (any(is.nan(CI)))
                    {
                        print("NaN in CI")
                        print(summary(theta.hat))
                    }
                    CI_lwr[j, ] <- CI[,1]
                    CI_upr[j, ] <- CI[,2]

                    j <- j + 1L
                    if (j > R) {
                        break
                    }
                }

                # compute asymptotics
                theta.mle <- NULL
                while (is.null(theta.mle)) {
                    data <- generate_data(N, theta, tau, p)
                    theta.mle <- custom_solver(
                        data = data,
                        theta = theta,
                        extra_info = "asymptotics",
                        annealing = use_aneal_start)
                }
                SE.asym <- se(theta.mle)
                MSE.asym <- mse(theta.hat, theta)
                MSE.asym.hat <- mse(theta.mle)

                # Calculate bias, MSE, and SE for each parameter
                bias <- colMeans(mles) - theta
                MSE <- sum(colMeans((mles - theta)^2))
                sigma <- cov(mles) * ((sample_size - 1) / sample_size)
                SE <- sqrt(diag(sigma))

                # Compute coverage probabilities
                coverage <- colMeans((CI_lwr <= theta) & (theta <= CI_upr))

                datum <- c(R, p, tau, q, N, bias, MSE, SE, SE.asym,
                    MSE.asym, MSE.asym.hat, coverage)

                write.table(t(datum), file = csv_filename, sep = ",",
                    col.names = FALSE, row.names = FALSE, append = TRUE)
            }
        }
    }
}

exp_experiment_5_gen(
    append = FALSE,
    csv_filename = "exp_experiment_5.csv",
    use_aneal_start = TRUE)

The purpose of this data set is to analyze the sensivity of the exponential series system (5 components, all roughly the same failure rate) with respect to a change in:

- Bernoulli probabilities (bernoulli_probs)
- Sample sizes (sample_sizes)
- Right censoring times (quants)

Here is how we generated this data set (we do not evaluate this code, since we already generated the data set and saved it to a file):

source("exp_experiment_gen.R")
exp_experiment_gen(
    R = 999,
    bernoulli_probs = c(.2, .3, .4, .5),
    quants = c(.1, .2, .3, .4, .5),
    sample_size = c(25, 50, 100, 200, 400),
    append = FALSE,
    seed = 32861,
    use_aneal_start = TRUE,
    theta = c(1, 1.1, 0.98, 1.12, 1.05),
    csv_filename = "exp_experiment_5.csv")

We read the data set from the file with:

df <- read.csv("exp_experiment_5.csv")
kable(df[1:5, 1:10])
kable(df[1:5, 11:20])
kable(df[1:5, 21:ncol(df)])

Coverage Probabilities

Let's visualize the coverage probabilities:

###################### coverage probability ##################
df_long <- df %>% pivot_longer(
    cols = starts_with("coverage"),
    names_to = "Coverage",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$Coverage <- as.factor(df_long$Coverage)

# Plot the coverage probabilities, with a solid line for the
# 95% confidence level
ggplot(df_long, aes(x = p, y = Value, color = Coverage)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = .95, linetype = "dashed") +
  labs(x = "Bernoulli Probability (p)", y = "Coverage Probability",
    title = "Coverage Probability vs Bernoulli Probability",
    color = "Coverage")

Analysis

Analysis here

Bias

df_long <- df %>% pivot_longer(
    cols = starts_with("bias"),
    names_to = "Bias",
    values_to = "Value")

# Convert the "Bias" column to a factor for better plotting
df_long$Bias <- as.factor(df_long$Bias)

# Plot,
ggplot(df_long, aes(x = p, y = Value, color = Bias)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "Bias",
       title = "Bias vs Bernoulli Pobability", color = "Bias")

Analysis

Analysis here

MSE

ggplot(df) +
  geom_point(aes(x = p, y = mse)) +
  geom_line(aes(x = p, y = mse, color = "Simulation")) +
  geom_line(aes(x = p, y = mse_asym,
                color = "Asymptotic"), linetype="dashed") +
  labs(x = "Bernoulli Probability (p)", y = "Mean Squared Error",
       title = "Mean Squared Error vs Bernoulli Probability")

Analysis

Analysis here

SE

df_long <- df %>% pivot_longer(
    # regex to match column names "se<digit>"
    cols = matches("se[0-9]"),
    names_to = "SE",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$SE <- as.factor(df_long$SE)

# Plot
ggplot(df_long, aes(x = p, y = Value, color = SE)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "SE",
       title = "SE vs Bernoulli Probability", color = "SE")

Analysis

Analysis here

SE asymptotics

df_long <- df %>% pivot_longer(
    # regex to match column names "se<digit>"
    cols = starts_with("se_asym"),
    names_to = "SE",
    values_to = "Value")

# Convert the "Coverage" column to a factor for better plotting
df_long$SE <- as.factor(df_long$SE)

# Plot
ggplot(df_long, aes(x = p, y = Value, color = SE)) +
  geom_point() +
  geom_line() +
  labs(x = "Bernoulli Probability (p)", y = "SE",
       title = "SE vs Bernoulli Probability", color = "SE")