library("rqdatatable") # requires version rqdatatable 1.1.2 or newer and rquery 1.2.1 or newer library("wrapr") library("cdata") library("ggplot2") library("glmnet") set.seed(2018) naive_sd_fun <- function(x) { sqrt(mean((mean(x)-x)^2)) } Bessel_sd_fun <- function(x) { sd(x) } eval_scale_adjustment_table_ratio <- function(n, p, sd_fun) { obs <- 0:n probs <- dbinom(obs, size = n, prob = p) evals <- vapply(obs, function(ki) { sd_fun(c(rep(1, ki), rep(0, n-ki))) }, numeric(1)) sd_target <- sqrt(p*(1-p)) sum(probs*evals)/sd_target } eval_scale_adjustment_table_diff <- function(n, p, sd_fun) { obs <- 0:n probs <- dbinom(obs, size = n, prob = p) evals <- vapply(obs, function(ki) { sd_fun(c(rep(1, ki), rep(0, n-ki))) }, numeric(1)) sd_target <- sqrt(p*(1-p)) sum(probs*evals)-sd_target } # related to Wald mini/max type ideas and: # http://www.win-vector.com/blog/2014/07/frequenstist-inference-only-seems-easy/ # https://mathoverflow.net/questions/177574/existence-of-solutions-of-a-polynomial-system # think this is under-determined, so could ask for symmetry or all coefs near 1. # also a lot like a Chebyshev polynomial (or integral of one), or upside down beta. # can also try priors like beta(0.5,0.5) # could also solve for affine version- where we allow addition in addition to scaling. # must have a really nice form if we integrate the ps-term (likely a calculus of variations solution). solve_for_scaling_table <- function(n, sd_fun = naive_sd_fun, intercept = TRUE, excess_resolution = 16) { if(n<2) { return(rep(1, n+1)) } obs <- 1:(n-1) vars <- paste0("s_", 1:(n-1)) ps <- seq(1/(excess_resolution*n), (excess_resolution*n-1)/(excess_resolution*n), by = 1/(excess_resolution*n)) d <- data.frame(target = sqrt(ps*(1-ps))) for(ki in seq_len(length(vars))) { k <- obs[[ki]] var <- vars[[ki]] d[[var]] <- 0 for(ii in seq_len(length(ps))) { pi <- ps[[ii]] prob <- dbinom(k, size = n, prob = pi) est <- sd_fun(c(rep(1, k), rep(0, n-k))) cij <- prob*est d[[var]][[ii]] <- cij d$target[[ii]] <- d$target[[ii]] - cij # regularize towards zero } } const <- 0.0 if(length(vars)<=1) { m <- lm(mk_formula("target", vars, intercept = intercept), data= d) soln <- as.numeric(m$coefficients[vars]) + 1 if(intercept) { const = m$coefficients[["(Intercept)"]] } } else { m <- glmnet(as.matrix(d[, vars, drop = FALSE]), d$target, alpha=0, lambda=1e-6, family = "gaussian", intercept = intercept, weights = dbeta(ps, 0.5, 0.5), # lower.limits = -10, standardize = FALSE) soln <- as.numeric(m$beta) + 1 if(intercept) { const = m$a0 } } multipliers = c(0, soln, 0) ests <- vapply(0:n, function(k) { sd_fun(c(rep(1, k), rep(0, n-k)))*multipliers[[k+1]] + const }, numeric(1)) function(x) { if(length(x)!=n) { stop("wrong length") } if(length(setdiff(x, c(0,1)))>0) { stop("non zero/one value") } k = sum(x) ests[[k+1]] } } plot_estimate_ratios <- function(n, ssoln) { adjs <- data.frame(p = seq(0, 1, by = 0.01)) adjs$joint_scaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_ratio(n, pi, sd_fun = ssoln) }, numeric(1)) adjs$Bessel_scaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_ratio(n, pi, sd_fun = Bessel_sd_fun) }, numeric(1)) adjs$unscaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_ratio(n, pi, sd_fun = naive_sd_fun) }, numeric(1)) adjsp <- unpivot_to_blocks( adjs, nameForNewKeyColumn = "method", nameForNewValueColumn = "ratio", columnsToTakeFrom = c("joint_scaled", "unscaled", "Bessel_scaled")) adjsp <- adjsp[!is.na(adjsp$ratio), , drop = FALSE] adjsp$method <- reorder(factor(adjsp$method), -adjsp$ratio) ggplot(data = adjsp, mapping = aes(x = p, y = ratio, color = method)) + geom_line() + geom_hline(yintercept = 1, color = "red", linetype=2) + scale_color_brewer(palette = "Dark2") + ggtitle("ratio betweeen estimated and theoretical standard deviations", subtitle = paste0("n= ", n)) } plot_estimate_diffs <- function(n, ssoln) { adjs <- data.frame(p = seq(0, 1, by = 0.01)) adjs$joint_scaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_diff(n, pi, sd_fun = ssoln) }, numeric(1)) adjs$Bessel_scaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_diff(n, pi, sd_fun = Bessel_sd_fun) }, numeric(1)) adjs$unscaled <- vapply( adjs$p, function(pi) { eval_scale_adjustment_table_diff(n, pi, sd_fun = naive_sd_fun) }, numeric(1)) adjsp <- unpivot_to_blocks( adjs, nameForNewKeyColumn = "method", nameForNewValueColumn = "difference", columnsToTakeFrom = c("joint_scaled", "unscaled", "Bessel_scaled")) adjsp <- adjsp[!is.na(adjsp$difference), , drop = FALSE] adjsp$method <- reorder(factor(adjsp$method), -adjsp$difference) ggplot(data = adjsp, mapping = aes(x = p, y = difference, color = method)) + geom_line() + geom_hline(yintercept = 0, color = "red", linetype=2) + scale_color_brewer(palette = "Dark2") + ggtitle("difference betweeen estimated and theoretical standard deviations", subtitle = paste0("n=", n)) } plot_estimates <- function(n, ssoln) { adjs <- data.frame(k = 0:n) adjs$joint_scaled <- vapply(0:n, function(k) { ssoln(c(rep(1, k), rep(0, n-k))) }, numeric(1)) adjs$Bessel_scaled <- vapply(0:n, function(k) { sd(c(rep(1, k), rep(0, n-k))) }, numeric(1)) adjs$naive_sd <- vapply(0:n, function(k) { naive_sd_fun(c(rep(1, k), rep(0, n-k))) }, numeric(1)) adjp <- cdata::unpivot_to_blocks(adjs, nameForNewKeyColumn = "estimation_method", nameForNewValueColumn = "estimate", columnsToTakeFrom = qc(joint_scaled, Bessel_scaled, naive_sd)) ggplot(data = adjp, mapping = aes(x = k, y = estimate, color = estimation_method, group = estimation_method)) + geom_point() + geom_linerange(aes(ymin = 0, ymax = estimate)) + scale_color_brewer(palette = "Dark2") + facet_wrap(~estimation_method, ncol=1) + ggtitle("standard deviation estimates", subtitle = paste0("n=", n)) } cl <- parallel::makeCluster(parallel::detectCores()) intercept <- TRUE print(paste("*******", intercept)) for(n in c(2, 3, 4, 5, 10, 20, 100)) { print("******") print(n) tab <- solve_for_scaling_table(n, intercept = intercept) plot_estimates(n, tab) %.>% print(.) plot_estimate_ratios(n, tab) %.>% print(.) plot_estimate_diffs(n, tab) %.>% print(.) } data <- as.data.frame(Titanic) data <- data %.>% select_rows(., (Class == "Crew") & (Sex == "Female") & (Age == "Adult")) %.>% orderby(., "Survived") print(data) universe <- c(rep(0, data$Freq[data$Survived=="No"]), rep(1, data$Freq[data$Survived=="Yes"])) print(universe) summary1 <- function(x, tabu) { scale_corrected_sd <- tabu(x) naive_var <- mean((mean(x)-x)^2) sd_est <- sd(x) # https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation # correctly correct for normal, not binomomial as we have here! data.frame(mean = mean(x), var = var(x), sd = sd_est, naive_var = naive_var, naive_sd = sqrt(naive_var), scale_corrected_sd = scale_corrected_sd) } samp_size <- 5 mk_f <- function(universe, samp_size, summary1, intercept) { force(universe) force(samp_size) force(summary1) force(intercept) scale_adjustment_table <- solve_for_scaling_table(samp_size, naive_sd_fun, intercept = intercept) f <- function(i) { sample <- universe[sample.int(length(universe), samp_size, replace = TRUE)] summary1(sample, scale_adjustment_table) } } f <- mk_f(universe, samp_size, summary1, intercept) print("universe") tabu <- solve_for_scaling_table(length(universe), naive_sd_fun, intercept = intercept) su <- summary1(universe, tabu) n <- length(universe) print("actual sd (non-sample)") print(su) Bessel_corrected_var <- (n/(n-1))*su$naive_var print(Bessel_corrected_var) Bessel_corrected_sd <- sqrt(Bessel_corrected_var) print(Bessel_corrected_sd) res <- parallel::parLapply(cl, 1:100000, f) res <- do.call(rbind, res) sums <- as.data.frame(lapply(res, mean)) res$Bessel_sd <- res$sd res$Bessel_var <- res$var rpv <- res %.>% extend(., Bessel_var := Bessel_sd^2) %.>% unpivot_to_blocks(., nameForNewKeyColumn = "estimation_method", nameForNewValueColumn = "var_estimate", columnsToTakeFrom = c("naive_var", "Bessel_var")) efv <- project(rpv, var_estimate = mean(var_estimate), groupby = "estimation_method") rpv %.>% project(., variance_of_estimate = var(var_estimate), groupby = "estimation_method") p3v <- ggplot(data = rpv, aes(x = var_estimate)) + geom_histogram(bins = 30) + geom_vline(xintercept = su$naive_var, color = "red", alpha = 0.5, size=3, linetype =3) + geom_vline(xintercept = su$naive_var, color = "red", alpha = 0.5) + geom_vline(data = efv, aes(xintercept = var_estimate), linetype=2) + facet_wrap(~estimation_method, ncol=1) + ggtitle("distribution of variance estimates by method", subtitle = "average shown in black, original universe variance value in red") print(p3v) rp <- unpivot_to_blocks(res, nameForNewKeyColumn = "estimation_method", nameForNewValueColumn = "sd_estimate", columnsToTakeFrom = c( "scale_corrected_sd", "naive_sd", "Bessel_sd")) ef <- project(rp, sd_estimate = mean(sd_estimate), groupby = "estimation_method") rp %.>% project(., variance_of_estimate = var(sd_estimate), groupby = "estimation_method") rpb <- rp[rp$estimation_method %in% qc(Bessel_sd, naive_sd), , drop = FALSE] efb <- ef[ef$estimation_method %in% qc(Bessel_sd, naive_sd), , drop = FALSE] p3b <- ggplot(data = rpb, aes(x = sd_estimate)) + geom_histogram(bins = 30) + geom_vline(xintercept = su$naive_sd, color = "red", alpha = 0.5, size=2, linetype =3) + geom_vline(xintercept = su$naive_sd, color = "red", alpha = 0.5) + geom_vline(data = efb, aes(xintercept = sd_estimate), linetype=2) + facet_wrap(~estimation_method, ncol=1) + ggtitle("distribution of sd estimates by method", subtitle = "average shown in black, original universe sd value in red") print(p3b) rpa <- rp[rp$estimation_method %in% qc(Bessel_sd, naive_sd, scale_corrected_sd), , drop = FALSE] efa <- ef[ef$estimation_method %in% qc(Bessel_sd, naive_sd, scale_corrected_sd), , drop = FALSE] p3 <- ggplot(data = rpa, aes(x = sd_estimate)) + geom_histogram(bins = 30) + geom_vline(xintercept = su$naive_sd, color = "red", alpha = 0.5, size=2, linetype =3) + geom_vline(xintercept = su$naive_sd, color = "red", alpha = 0.5) + geom_vline(data = efa, aes(xintercept = sd_estimate), linetype=2) + facet_wrap(~estimation_method, ncol=1) + ggtitle("distribution of sd estimates by method", subtitle = "average shown in black, original universe sd value in red") print(p3) rp %.>% project(., variance_of_estimate = var(sd_estimate), groupby = "estimation_method") parallel::stopCluster(cl)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.