knitr::opts_chunk$set(echo = TRUE)
library(algebraic.mle) library(dplyr) library(ggplot2) library(reshape2) library(knitr) library(md.tools) library(md.series.system) library(utils) library(tidyverse) library(gridExtra)
# this is the actual code i ran to generate the data set for # `exp_experiment_4.csv`. this was done before i generalized the approach. # i'm keeping all of the code the same since this is actually what was run # to generate the data set, but it should be the same as what i posted # below in block `run-exp-experiment-4` generate_data <- function(n, theta, tau, p) { m <- length(theta) comp_times <- matrix(nrow=n,ncol=m) for (j in 1:m) comp_times[,j] <- rexp(n,theta[j]) comp_times <- md_encode_matrix(comp_times,"t") comp_times %>% md_series_lifetime_right_censoring(tau) %>% md_bernoulli_cand_C1_C2_C3(p) %>% md_cand_sampler() } custom_solver <- function(data, theta, extra_info = NULL, annealing = TRUE) { ll <- md_loglike_exp_series_C1_C2_C3(data) ll.grad <- md_score_exp_series_C1_C2_C3(data) fish <- md_fim_exp_series_C1_C2_C3(data) ll.hess <- function(x) -fish(x) theta.hat <- NULL start <- NULL m <- length(theta) tryCatch({ start <- list(par = theta) if (annealing) { start <- sim_anneal(par = theta, fn = ll, control = list(fnscale = -1, maxit = 100000L, trace = FALSE, t_init = 20, 1e-3, alpha = 0.95, it_per_temp = 50L, neigh = function(par, temp, ...) { tt <- min(temp, 1) par + rnorm(m, 0, tt) }, sup = function(x) all(x > 0))) } res_optim <- optim( par = start$par, fn = ll, gr = ll.grad, method = "L-BFGS-B", lower = 1e-30, hessian = FALSE, control = list(fnscale = -1, maxit = 1000L)) theta.hat <- mle_numerical(res_optim, options = list(hessian = ll.hess(res_optim$par))) }, error = function(e) { cat("Sample size", nrow(data), " | Anneal:", start$par, " | ", e$message) if (!is.null(extra_info)) { cat(" | ", extra_info) } cat("\n") }) theta.hat } exp_experiment_4_gen <- function( csv_filename, R = 1000, bernoulli_probs = c(.2, .3, .4, .5, .6, .7, .8, .9), q = .25, sample_size = 100, append = TRUE, use_aneal_start = TRUE) { set.seed(32861) # set seed for reproducibility theta <- c(1, # component 1 failure rate 1.1, # 2 0.98, # 3 1.12, # 4 1.05) # 5 m <- length(theta) tau <- -(1/sum(theta))*log(q) if (!append) { cnames <- c("R", "p", "tau", "N", paste0("bias",1:m), "mse", paste0("se",1:m), paste0("se_asym",1:m), "mse_asym", "mse_asym_hat", paste0("coverage",1:m))## # Write column names first write.table(t(cnames), file = csv_filename, sep = ",", col.names = FALSE, row.names = FALSE, append = FALSE, quote = FALSE) } for (i in 1:length(bernoulli_probs)) { p <- bernoulli_probs[i] cat("Starting simulations for Bernoulli probability", p, "\n") mles <- matrix(nrow = R, ncol = m) # For storing the lower and upper bounds of the confidence intervals CI_lwr <- matrix(nrow = R, ncol = m) CI_upr <- matrix(nrow = R, ncol = m) j <- 1L repeat { data <- generate_data(sample_size, theta, tau, p) theta.hat <- custom_solver( data = data, theta = theta, extra_info = paste0("Replicate(", j, ")"), annealing = use_aneal_start) if (is.null(theta.hat)) { next } if (j %% 10 == 0) { cat("Sample size", sample_size, " | Replicate ", j, " | MLE ", point(theta.hat), "\n") } mles[j, ] <- point(theta.hat) CI <- confint(theta.hat) if (any(is.nan(CI))) { print("NaN in CI") print(summary(theta.hat)) } CI_lwr[j, ] <- CI[,1] CI_upr[j, ] <- CI[,2] j <- j + 1L if (j > R) { break } } # compute asymptotics theta.mle <- NULL while (is.null(theta.mle)) { data <- generate_data(sample_size, theta, tau, p) theta.mle <- custom_solver( data = data, theta = theta, extra_info = "asymptotics", annealing = use_aneal_start) } SE.asym <- se(theta.mle) MSE.asym <- mse(theta.hat, theta) MSE.asym.hat <- mse(theta.mle) # Calculate bias, MSE, and SE for each parameter bias <- colMeans(mles) - theta MSE <- sum(colMeans((mles - theta)^2)) sigma <- cov(mles) * ((sample_size - 1) / sample_size) SE <- sqrt(diag(sigma)) # Compute coverage probabilities coverage <- colMeans((CI_lwr <= theta) & (theta <= CI_upr)) datum <- c(R, p, tau, sample_size, bias, MSE, SE, SE.asym, MSE.asym, MSE.asym.hat, coverage) write.table(t(datum), file = csv_filename, sep = ",", col.names = FALSE, row.names = FALSE, append = TRUE) } } exp_experiment_4_gen( csv_filename = "exp_experiment_4.csv", append = TRUE, use_aneal_start = TRUE)
The purpose of this data set is to analyze the sensivity of the exponential series system (5 components, all roughly the same failure rate) with respect to a change in the Bernoulli probabilities for moderately sized sample sizes.
Here is how we generated this data set (we do not evaluate this code, since we already generated the data set and saved it to a file):
source("exp_experiment_gen.R") exp_experiment_gen( R = 1000, bernoulli_probs = c(.2, .3, .4, .5, .6, .7, .8, .9), quants = .25, sample_sizes = 100, theta = c(1, 1.1, 0.98, 1.12, 1.05), seed = 32861, use_aneal_start = TRUE, append = FALSE, csv_filename = "exp_experiment_4.csv")
We read the data set from the file with:
df <- read.csv("exp_experiment_4.csv") kable(df[1:5, 1:10]) kable(df[1:5, 11:20]) kable(df[1:5, 21:ncol(df)])
Let's visualize the coverage probabilities:
###################### coverage probability ################## df_long <- df %>% pivot_longer( cols = starts_with("coverage"), names_to = "Coverage", values_to = "Value") # Convert the "Coverage" column to a factor for better plotting df_long$Coverage <- as.factor(df_long$Coverage) # Plot the coverage probabilities, with a solid line for the # 95% confidence level ggplot(df_long, aes(x = p, y = Value, color = Coverage)) + geom_point() + geom_line() + geom_hline(yintercept = .95, linetype = "dashed") + labs(x = "Bernoulli Probability (p)", y = "Coverage Probability", title = "Coverage Probability vs Bernoulli Probability", color = "Coverage")
Analysis here
df_long <- df %>% pivot_longer( cols = starts_with("bias"), names_to = "Bias", values_to = "Value") # Convert the "Bias" column to a factor for better plotting df_long$Bias <- as.factor(df_long$Bias) # Plot, ggplot(df_long, aes(x = p, y = Value, color = Bias)) + geom_point() + geom_line() + labs(x = "Bernoulli Probability (p)", y = "Bias", title = "Bias vs Bernoulli Pobability", color = "Bias")
Analysis here
ggplot(df) + geom_point(aes(x = p, y = mse)) + geom_line(aes(x = p, y = mse, color = "Simulation")) + geom_line(aes(x = p, y = mse_asym, color = "Asymptotic"), linetype="dashed") + labs(x = "Bernoulli Probability (p)", y = "Mean Squared Error", title = "Mean Squared Error vs Bernoulli Probability")
Analysis here
df_long <- df %>% pivot_longer( # regex to match column names "se<digit>" cols = matches("se[0-9]"), names_to = "SE", values_to = "Value") # Convert the "Coverage" column to a factor for better plotting df_long$SE <- as.factor(df_long$SE) # Plot ggplot(df_long, aes(x = p, y = Value, color = SE)) + geom_point() + geom_line() + labs(x = "Bernoulli Probability (p)", y = "SE", title = "SE vs Bernoulli Probability", color = "SE")
Analysis here
df_long <- df %>% pivot_longer( # regex to match column names "se<digit>" cols = starts_with("se_asym"), names_to = "SE", values_to = "Value") # Convert the "Coverage" column to a factor for better plotting df_long$SE <- as.factor(df_long$SE) # Plot ggplot(df_long, aes(x = p, y = Value, color = SE)) + geom_point() + geom_line() + labs(x = "Bernoulli Probability (p)", y = "SE", title = "SE vs Bernoulli Probability", color = "SE")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.