# hermiter" In hermiter: Efficient Sequential and Batch Estimation of Univariate and Bivariate Probability Density Functions and Cumulative Distribution Functions along with Quantiles (Univariate) and Spearman's Correlation (Bivariate)

mean_abs_error_quantiles=quantile_vals$mean_abs_error_quantiles)  datatable(mean_abs_error_summary) %>% formatRound(columns =c("mean_abs_error_density","mean_abs_error_cum_prob", "mean_abs_error_quantiles"),digits = 3)  # Applying to stationary data (sequential setting) ## Univariate Example Another useful application of the hermite_estimator class is to obtain pdf, cdf and quantile function estimates on streaming data. The speed of estimation allows the pdf, cdf and quantile functions to be estimated in real time. We illustrate this below for cdf and quantile estimation with a sample Shiny application. We reiterate that the particular usefulness is that the full pdf, cdf and quantile functions are updated in real time. Thus, any arbitrary quantile can be evaluated at any point in time. We include a stub for reading streaming data that generates micro-batches of standard exponential i.i.d. random data. This stub can easily be swapped out for a method reading micro-batches from a Kafka topic or similar. The Shiny sample code below can be pasted into a single app.R file and run directly. # Not Run. Copy and paste into app.R and run. library(shiny) library(hermiter) library(ggplot2) library(magrittr) ui <- fluidPage( titlePanel("Streaming Statistics Analysis Example: Exponential i.i.d. stream"), sidebarLayout( sidebarPanel( sliderInput("percentile", "Percentile:", min = 0.01, max = 0.99, value = 0.5, step = 0.01) ), mainPanel( plotOutput("plot"), textOutput("quantile_text") ) ) ) server <- function(input, output) { values <- reactiveValues(hermite_est = hermite_estimator(N = 10, standardize = TRUE)) x <- seq(-15, 15, 0.1) # Note that the stub below could be replaced with code that reads streaming # data from various sources, Kafka etc. read_stream_stub_micro_batch <- reactive({ invalidateLater(1000) new_observation <- rexp(10) return(new_observation) }) updated_cdf_calc <- reactive({ micro_batch <- read_stream_stub_micro_batch() for (idx in seq_along(micro_batch)) { values[["hermite_est"]] <- isolate(values[["hermite_est"]]) %>% update_sequential(micro_batch[idx]) } cdf_est <- isolate(values[["hermite_est"]]) %>% cum_prob(x, clipped = TRUE) df_cdf <- data.frame(x, cdf_est) return(df_cdf) }) updated_quantile_calc <- reactive({ values[["hermite_est"]] %>% quant(input$percentile)
})
output$plot <- renderPlot({ ggplot(updated_cdf_calc(), aes(x = x)) + geom_line(aes(y = cdf_est)) + ylab("Cumulative Probability") } ) output$quantile_text <- renderText({
return(paste(input$percentile * 100, "th Percentile:", round(updated_quantile_calc(), 2))) }) } shinyApp(ui = ui, server = server) # Applying to non-stationary data (sequential setting) ## Univariate Example The hermite_estimator is also applicable to non-stationary data streams. A weighted form of the Hermite series based estimator can be applied to handle this case. The estimator will adapt to the new distribution and "forget" the old distribution as illustrated in the example below. In this univariate example, the distribution from which the observations are drawn switches from a Chi-square distribution to a logistic distribution and finally to a normal distribution. In order to use the exponentially weighted form of the hermite_estimator, the exp_weight_lambda argument must be set to a non-NA value. Typical values for this parameter are 0.01, 0.05 and 0.1. The lower the exponential weighting parameter, the slower the estimator adapts and vice versa for higher values of the parameter. However, variance increases with higher values of exp_weight_lambda, so there is a trade-off to bear in mind. # Prepare Test Data num_obs <-2000 test <- rchisq(num_obs,5) test <- c(test,rlogis(num_obs)) test <- c(test,rnorm(num_obs))  # Calculate theoretical pdf, cdf and quantile values for comparison x <- seq(-15,15,by=0.1) actual_pdf_lognorm <- dchisq(x,5) actual_pdf_logis <- dlogis(x) actual_pdf_norm <- dnorm(x) actual_cdf_lognorm <- pchisq(x,5) actual_cdf_logis <- plogis(x) actual_cdf_norm <- pnorm(x) p <- seq(0.05,0.95,by=0.05) actual_quantiles_lognorm <- qchisq(p,5) actual_quantiles_logis <- qlogis(p) actual_quantiles_norm <- qnorm(p)  # Construct Hermite Estimator h_est <- hermite_estimator(N=20,standardize = TRUE,exp_weight_lambda = 0.005)  # Loop through test data and update h_est to simulate observations arriving # sequentially count <- 1 res <- data.frame() res_q <- data.frame() for (idx in seq_along(test)) { h_est <- h_est %>% update_sequential(test[idx]) if (idx %% 100 == 0){ if (floor(idx/num_obs)==0){ actual_cdf_vals <- actual_cdf_lognorm actual_pdf_vals <-actual_pdf_lognorm actual_quantile_vals <- actual_quantiles_lognorm } if (floor(idx/num_obs)==1){ actual_cdf_vals <- actual_cdf_logis actual_pdf_vals <-actual_pdf_logis actual_quantile_vals <- actual_quantiles_logis } if (floor(idx/num_obs)==2){ actual_cdf_vals <- actual_cdf_norm actual_pdf_vals <- actual_pdf_norm actual_quantile_vals <- actual_quantiles_norm } idx_vals <- rep(count,length(x)) cdf_est_vals <- h_est %>% cum_prob(x, clipped=TRUE) pdf_est_vals <- h_est %>% dens(x, clipped=TRUE) quantile_est_vals <- h_est %>% quant(p) res <- rbind(res,data.frame(idx_vals,x,cdf_est_vals,actual_cdf_vals, pdf_est_vals,actual_pdf_vals)) res_q <- rbind(res_q,data.frame(idx_vals=rep(count,length(p)),p, quantile_est_vals,actual_quantile_vals)) count <- count +1 } } res <- res %>% mutate(idx_vals=idx_vals*100) res_q <- res_q %>% mutate(idx_vals=idx_vals*100)  # Visualize Results for PDF (Not run, requires gganimate, gifski and transformr # packages) p <- ggplot(res,aes(x=x)) + geom_line(aes(y=pdf_est_vals, colour="Estimated")) + geom_line(aes(y=actual_pdf_vals, colour="Actual")) + scale_colour_manual("", breaks = c("Estimated", "Actual"), values = c("blue", "black")) + ylab("Probability Density") +transition_states(idx_vals,transition_length = 2,state_length = 1) + ggtitle('Observation index {closest_state}') anim_save("pdf.gif",p) # Visualize Results for CDF (Not run, requires gganimate, gifski and transformr # packages) p <- ggplot(res,aes(x=x)) + geom_line(aes(y=cdf_est_vals, colour="Estimated")) + geom_line(aes(y=actual_cdf_vals, colour="Actual")) + scale_colour_manual("", breaks = c("Estimated", "Actual"), values = c("blue", "black")) + ylab("Cumulative Probability") + transition_states(idx_vals, transition_length = 2,state_length = 1) + ggtitle('Observation index {closest_state}') anim_save("cdf.gif", p) # Visualize Results for Quantiles (Not run, requires gganimate, gifski and # transformr packages) p <- ggplot(res_q,aes(x=actual_quantile_vals)) + geom_point(aes(y=quantile_est_vals), color="blue") + geom_abline(slope=1,intercept = 0) +xlab("Theoretical Quantiles") + ylab("Estimated Quantiles") + transition_states(idx_vals,transition_length = 2, state_length = 1) + ggtitle('Observation index {closest_state}') anim_save("quant.gif",p) ## Bivariate Example We illustrate tracking a non-stationary bivariate data stream with another sample Shiny application. The bivariate Hermite estimator leverages an exponential weighting scheme as described in the univariate case and does not need to maintain a sliding window. We include a stub for reading streaming data that generates micro-batches of bivariate normal i.i.d. random data with a chosen Spearman's correlation coefficient (as this is easily linked to the standard correlation matrix). This stub can again be readily swapped out for a method reading micro-batches from a Kafka topic or similar. The Shiny sample code below can be pasted into a single app.R file and run directly. # Not Run. Copy and paste into app.R and run. library(shiny) library(hermiter) library(ggplot2) library(magrittr) ui <- fluidPage( titlePanel("Bivariate Streaming Statistics Analysis Example"), sidebarLayout( sidebarPanel( sliderInput("spearmans", "True Spearman's Correlation:", min = -0.9, max = 0.9, value = 0, step = 0.1) ), mainPanel( plotOutput("plot"), textOutput("spearman_text") ) ) ) server <- function(input, output) { values <- reactiveValues(hermite_est = hermite_estimator(N = 10, standardize = TRUE, exp_weight_lambda = 0.01, est_type="bivariate")) # Note that the stub below could be replaced with code that reads streaming # data from various sources, Kafka etc. read_stream_stub_micro_batch <- reactive({ invalidateLater(1000) sig_x <- 1 sig_y <- 1 num_obs <- 100 rho <- 2 *sin(pi/6 * input$spearmans)
observations_mat <- mvtnorm::rmvnorm(n=num_obs,mean=rep(0,2),sigma = matrix(c(sig_x^2,rho*sig_x*sig_y,rho*sig_x*sig_y,sig_y^2), nrow=2,ncol=2,
byrow = TRUE))
return(observations_mat)
})
updated_spear_calc <- reactive({
micro_batch <- read_stream_stub_micro_batch()
for (idx in seq_len(nrow(micro_batch))) {
values[["hermite_est"]] <- isolate(values[["hermite_est"]]) %>%
update_sequential(micro_batch[idx,])
}
spear_est <- isolate(values[["hermite_est"]]) %>%
spearmans(clipped = TRUE)
return(spear_est)
})
output$plot <- renderPlot({ vals <- seq(-5,5,by=0.25) x_grid <- as.matrix(expand.grid(X=vals, Y=vals)) rho <- 2 *sin(pi/6 * input$spearmans)
actual_pdf <-mvtnorm::dmvnorm(x_grid,mean=rep(0,2),sigma = matrix(c(sig_x^2,rho*sig_x*sig_y,rho*sig_x*sig_y,sig_y^2), nrow=2,ncol=2,
byrow = TRUE))
df_pdf <- data.frame(x_grid,actual_pdf)
p1 <- ggplot(df_pdf) + geom_tile(aes(X, Y, fill= actual_pdf)) +
scale_fill_gradient2(low="blue", mid="cyan", high="purple",
midpoint=.2,
breaks=seq(0,.4,by=.1),
limits=c(0,.4)) +ggtitle(paste("True Bivariate
Normal Density with matched Spearman's correlation")) +
theme(legend.title = element_blank())
p1
}
)
output\$spearman_text <- renderText({
return(paste("Spearman's Correlation Estimate from Hermite Estimator:",
round(updated_spear_calc(), 1)))
})
}
shinyApp(ui = ui, server = server) # Citing this package

Use the following code to automatically generate an appropriate BibTex entry.

citation("hermiter")


## Try the hermiter package in your browser

Any scripts or data that you put into this service are public.

hermiter documentation built on Feb. 17, 2021, 9:06 a.m.