#R Log to fill Table Shells using small practice runs of 20 reps for each of 12 workloads
#========================================================================================
#4/19/2022
setwd("c://EricStf/CensusProj/DiffPrivacy")
attach("bottomUp-forEric-4-13.RData")
objects(2)
# [1] "A" "R" ## A is 2x2x7x3x2x43 = 7224 elements, R is 7224x12x20
## array dimensions: Own, Sex, Race, Age, Hisp, Tract
## noise added should be same in each interior cell and margin when new workload is added
## Confirm indexing of marginals with James on morning of 4/19.
## Need to create new tables B for County and Carr for State to index later workloads.
## tracts break down into 7 counties successively in blocks c(8,9,11,6,2,3,4)
csum = cumsum(0,c(8,9,11,6,2,3,4))
> B = array(0, c(2,2,7,3,2,7), dimnames=c(dimnames(A)[1:5],list(paste0("C",1:7))))
dimnames(B)
[[1]]
[1] "own" "rnt"
[[2]]
[1] "mal" "fem"
[[3]]
[1] "wh" "bl" "as" "aian" "pac" "oth" "twp"
[[4]]
[1] "0-17" "18-62" "62p"
[[5]]
[1] "hisp" "nonhisp"
[[6]]
[1] "C1" "C2" "C3" "C4" "C5" "C6" "C7" ## OK
> for(i in 1:7) B[,,,,,i] = apply(A,c(1:5), function(vec)
sum(vec[(csum[i]+1):csum[i+1]]) )
## Now can do a similar thing for "State" rollup
## For example, assuming the first 3 counties are ST1,, and the next 4 ST2:
> Carr = array(0, c(2,2,7,3,2,2), dimnames=c(dimnames(A)[1:5],list(c("ST1","ST2"))))
> dimnames(Carr)
[[1]]
[1] "own" "rnt"
[[2]]
[1] "mal" "fem"
[[3]]
[1] "wh" "bl" "as" "aian" "pac" "oth" "twp"
[[4]]
[1] "0-17" "18-62" "62p"
[[5]]
[1] "hisp" "nonhisp"
[[6]]
[1] "ST1" "ST2"
> ssum = c(0,3,7)
for(i in 1:2) Carr[,,,,,i] = apply(B,c(1:5), function(vec)
sum(vec[(ssum[i]+1):ssum[i+1]]) )
## NB in bottom-up workloads 3, 6, 9, table is based on VotingAge
## which is a tabulation of just 2 out of the 3 age-categories.
## So we make 3 other tables with suffix V collapsing 2 age-levels.
> AV = A[,,,1:2,,]
BV = B[,,,1:2,,]
CV = Carr[,,,1:2,,]
dimnames(AV)[[4]][2] = dimnames(BV)[[4]][2] = dimnames(CV)[[4]][2] = "18p"
AV[,,,2,,] = A[,,,2,,] + A[,,,3,,]
BV[,,,2,,] = B[,,,2,,] + B[,,,3,,]
CV[,,,2,,] = Carr[,,,2,,] + Carr[,,,3,,]
#===========================================================
# Next get ready to tally Sim Results according to size classes of cells
# Since A is the array with "true" cell counts, we create size classes.
## 144480 = pop total, in 7224 cells of average size 20, 3630 empty (about half)
> c(sum(c(A)==0), sum(c(A)<=5), sum(c(A)<=15), sum(c(A)<=30), sum(c(A)<=100), sum(c(A)> 100))
[1] 3630 5476 6103 6317 6721 503
> SizGp = cut(c(A), breaks=c(0,1,6,16,51,151,1001,10000, 2e5), right=F)
> table(SizGp)
SizGp
[0,1) [1,6) [6,16) [16,51) [51,151) [151,2e+05) [1e+03,1e+04) [1e+04,2e+05)
3630 1846 627 340 406 375 0 0
## Now want to indicate how the various marginal cells [used in workloads]
## fall within these size classifications
## the marginals for the Sims have the following dimension and array combinations
Sim Array Dimensions
--- ----- ----------
1 A 1:6
2 A 1,6
3 AV 3:6
4 A 2,4,6
5 B 1,6
6 BV 3:6
7 B 2,4,6
8 Carr 1,6
9 CV 3:6
10 Carr 2,4,6
11 B 1:6
12 Carr 1:6
ArrList = c("A","A","AV","A","B","BV","B","Carr","CV","Carr","B","Carr")
DimList = list(1:6,c(1,6),c(3:6),c(2,4,6),c(1,6),c(3:6),c(2,4,6),
c(1,6),c(3:6),c(2,4,6),1:6,1:6)
## Now we can create a little Table showing the numbers of different levels
## of margin-cells in the different size categories
> SizCat.Tbl = array(0, c(12,8), dimnames=list(paste0("Sim",1:12),
levels(SizGp)))
for(i in 1:12) SizCat.Tbl[i,] = table(
cut(apply(get(ArrList[i]),DimList[[i]],sum),
breaks=c(0,1,6,16,51,151,1001,10001,2e5), right=F))
> SizCat.Tbl
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03) [1e+03,1e+04) [1e+04,2e+05)
Sim1 3630 1846 627 340 406 375 0 0
Sim2 0 0 0 0 0 12 74 0
Sim3 768 150 70 75 57 49 35 0
Sim4 0 0 0 0 0 255 3 0
Sim5 0 0 0 0 0 0 8 6
Sim6 109 15 13 14 12 20 10 3
Sim7 0 0 0 0 0 0 42 0
Sim8 0 0 0 0 0 0 0 4
Sim9 29 1 5 2 6 6 4 3
Sim10 0 0 0 0 0 0 6 6
Sim11 302 327 129 159 106 105 48 0
Sim12 43 70 64 15 71 37 36 0
## Now consider the coding/functions to fill table shells.
# (T1) mean-squared discrepancy over all interior cells:
round(apply((R - c(A))^2,2,mean),3)
sim2 sim3 sim4 sim5 sim6 sim7 sim8 sim9 sim10 sim11 sim12
5.421 5.034 4.792 5.115 4.838 4.742 4.785 4.584 4.081 4.203 3.781
## seems convincingly to be trending downward, although not with absolute discrepancy:
> round(apply(abs(R - c(A)),2,mean),3)
sim2 sim3 sim4 sim5 sim6 sim7 sim8 sim9 sim10 sim11 sim12
1.205 1.218 1.220 1.275 1.274 1.274 1.321 1.313 1.305 1.403 1.349
# (T2) mean-squared discrepancy over interior cells, by size class
T2tab = array(0, c(11,6), dimnames=list(paste0("Sim",2:12), levels(SizGp)[1:6]))
> for(i in 1:6) {
inds = which(as.numeric(SizGp)==i)
T2tab[,i] = apply((R[inds,,]-c(A)[inds])^2,2,mean) }
> round(T2tab,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03)
Sim2 4.786 5.009 3.307 12.661 12.484 2.919
Sim3 4.843 4.110 4.786 8.392 10.471 2.919
Sim4 4.511 4.514 3.218 5.831 10.205 4.718
Sim5 5.148 4.994 5.052 5.663 4.548 5.625
Sim6 4.960 4.753 4.112 5.257 4.730 5.024
Sim7 4.787 4.634 3.966 5.100 5.283 5.220
Sim8 4.705 4.718 4.337 5.336 5.591 5.262
Sim9 4.525 4.263 4.772 5.405 5.172 5.041
Sim10 4.021 3.906 4.484 4.036 4.512 4.425
Sim11 4.099 4.076 4.412 4.435 4.626 4.815
Sim12 3.641 3.748 4.088 3.960 4.173 4.208
## Interesting differences between the trend across sims based on size of cell.
T2abs = replace(T2tab,T,0)
for(i in 1:6) {
inds = which(as.numeric(SizGp)==i)
T2abs[,i] = apply(abs(R[inds,,]-c(A)[inds]),2,mean) }
round(T2abs,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03)
Sim2 1.195 1.203 1.196 1.271 1.256 1.209
Sim3 1.210 1.210 1.234 1.258 1.285 1.209
Sim4 1.210 1.215 1.205 1.245 1.326 1.236
Sim5 1.275 1.266 1.280 1.306 1.251 1.313
Sim6 1.279 1.270 1.245 1.293 1.263 1.294
Sim7 1.277 1.269 1.247 1.294 1.281 1.292
Sim8 1.316 1.310 1.307 1.362 1.368 1.356
Sim9 1.311 1.286 1.328 1.372 1.342 1.354
Sim10 1.301 1.287 1.341 1.311 1.324 1.345
Sim11 1.397 1.375 1.429 1.438 1.441 1.483
Sim12 1.332 1.336 1.402 1.383 1.401 1.411
T2max = replace(T2tab,T,0)
for(i in 1:6) {
inds = which(as.numeric(SizGp)==i)
T2max[,i] = apply(abs(R[inds,,]-c(A)[inds]),2,max) }
round(T2max,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03)
Sim2 166.572 218.298 76.335 152.030 165.005 12.837
Sim3 120.107 119.369 66.004 108.910 94.163 12.837
Sim4 109.871 119.369 35.911 84.987 82.948 89.224
Sim5 70.220 55.253 59.150 52.733 51.153 54.442
Sim6 62.502 46.213 45.839 43.806 54.691 55.926
Sim7 66.157 49.458 38.268 51.388 55.030 59.506
Sim8 55.087 41.538 30.501 31.385 34.656 30.593
Sim9 38.922 39.587 30.423 35.193 38.651 30.958
Sim10 30.360 27.169 25.397 22.817 26.928 24.993
Sim11 24.350 16.538 18.881 16.687 15.989 17.731
Sim12 22.854 18.173 18.261 12.929 12.319 15.199
# Look at biases:
> round(apply(R - c(A),2,mean),6)
sim2 sim3 sim4 sim5 sim6 sim7 sim8 sim9 sim10 sim11 sim12
2.1e-05 8.7e-05 5.2e-05 5.2e-05 5.6e-05 3.1e-05 3.0e-05 2.4e-05 -3.8e-05 -4.4e-05 -3.5e-05
T2bias = replace(T2tab,T,0)
for(i in 1:6) {
inds = which(as.numeric(SizGp)==i)
T2bias[,i] = apply(R[inds,,]-c(A)[inds],2,mean) }
round(T2bias,4)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03)
Sim2 0.0062 0.0069 0.0347 -0.0608 -0.0965 0.0076
Sim3 0.0197 -0.0014 -0.0052 -0.0483 -0.1269 0.0076
Sim4 0.0198 -0.0019 0.0235 -0.0363 -0.1549 -0.0204
Sim5 0.0152 -0.0181 -0.0084 -0.0391 -0.0253 0.0197
Sim6 0.0168 -0.0211 -0.0033 -0.0248 -0.0315 0.0040
Sim7 0.0127 -0.0217 0.0130 -0.0086 -0.0342 0.0076
Sim8 0.0225 -0.0296 0.0013 -0.0437 -0.0406 0.0094
Sim9 0.0196 -0.0164 -0.0256 -0.0472 -0.0293 0.0088
Sim10 0.0101 -0.0198 0.0029 0.0159 -0.0237 0.0052
Sim11 0.0075 -0.0197 0.0185 -0.0112 0.0022 0.0004
Sim12 0.0113 -0.0253 0.0133 0.0019 -0.0071 -0.0017
## This Table says to me that bias is completely not an issue when
# the released data is continuous (and not sign-restricted), so (virtually)
# all biases are due to postptocessing.
> apply(R, 2, function(col) sum(col < 0)) /20
sim2 sim3 sim4 sim5 sim6 sim7 sim8 sim9 sim10 sim11 sim12
2053.05 2052.00 2051.35 2046.15 2045.50 2045.70 2043.30 2041.35 2051.30 2059.70 2054.45
> round( apply(R, 2, function(col) -mean(col[col < 0])), 3)
sim2 sim3 sim4 sim5 sim6 sim7 sim8 sim9 sim10 sim11 sim12
1.203 1.213 1.208 1.307 1.309 1.306 1.349 1.338 1.325 1.423 1.349
#================================================================
# (T3) finish off by looking at some margins.
# For example let's look at the Sim3 (Race x VA x Hisp x Tract) margins through
# all Sim3 - Sim12, overall and by size-class
# loop over Sim3-Sim12 using index 3:12 in ArrList and DimList:
> summary(c(apply(AV, 3:6, sum)))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 3.00 120.00 29.25 3437.00 ## 1204 cells, avg size 120
auxGp = cut(apply(AV, 3:6, sum), breaks=c(0,1,6,16,51,151,1001,10000), right=F)
table(auxGp)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03) [1e+03,1e+04)
384 304 141 133 87 111 44
## Overall mean-square discrepancy:
Rtmp = array(R, c(2,2,7,3,2,43,11,20))
Rtmp = apply(Rtmp, 3:8, sum)
Rtmp[,2,,,,] = Rtmp[,2,,,,]+Rtmp[,3,,,,]
Rtmp = array( Rtmp[,1:2,,,2:11,], c(1204,10,20) )
round(apply( (Rtmp-c(apply(AV, 3:6, sum)))^2, 2, mean ), 3)
[1] 30.204 28.911 29.459 27.629 27.332 24.937 24.210 20.049 1.867 1.757
# dramatic change quite close to the end, in Sim11: why ?? look by size class
> T3tab = array(0,c(10,7), dimnames=list(paste0("Sim",3:12), levels(auxGp)))
for(i in 1:7) {
inds = which(as.numeric(auxGp)==i)
T3tab[,i] = apply((Rtmp[inds,,]-apply(AV, 3:6, sum)[inds])^2,2,mean) }
> round(T3tab,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03) [1e+03,1e+04)
Sim3 16.059 43.084 24.507 26.668 32.428 53.443 30.572
Sim4 21.575 36.064 27.864 21.438 25.016 40.437 48.080
Sim5 25.004 32.636 31.073 31.413 30.898 26.921 38.861
Sim6 25.133 27.493 31.785 30.621 27.499 25.891 32.623
Sim7 25.314 27.189 29.792 28.014 26.062 27.852 37.182
Sim8 23.181 24.907 24.390 25.738 25.023 27.111 34.147
Sim9 22.595 23.378 24.135 24.877 25.137 27.208 32.890
Sim10 18.920 19.580 19.353 22.574 20.437 20.180 26.653
Sim11 1.846 1.855 1.798 1.911 1.935 1.911 1.980
Sim12 1.784 1.753 1.760 1.753 1.723 1.647 1.904
## across all size classes: a big change in these margins in Sim 11 !!
## why should county marginals make such a big difference in these tract-level marginals??
> T3abs = array(0,c(10,7), dimnames=list(paste0("Sim",3:12), levels(auxGp)))
for(i in 1:7) {
inds = which(as.numeric(auxGp)==i)
T3abs[,i] = apply(abs(Rtmp[inds,,]-apply(AV, 3:6, sum)[inds]),2,mean) }
round(T3abs,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03) [1e+03,1e+04)
Sim3 3.082 3.556 3.208 3.500 3.437 3.559 3.557
Sim4 3.164 3.482 3.276 3.399 3.314 3.540 3.893
Sim5 3.429 3.711 3.626 3.735 3.627 3.497 4.037
Sim6 3.454 3.628 3.635 3.752 3.522 3.486 3.832
Sim7 3.465 3.633 3.559 3.661 3.535 3.529 3.942
Sim8 3.477 3.623 3.523 3.680 3.631 3.712 4.072
Sim9 3.460 3.536 3.506 3.620 3.600 3.715 4.036
Sim10 3.294 3.365 3.284 3.530 3.432 3.360 3.817
Sim11 0.982 0.978 0.969 0.994 0.994 1.003 1.011
Sim12 0.961 0.958 0.975 0.959 0.964 0.939 0.984
> T3max = array(0,c(10,7), dimnames=list(paste0("Sim",3:12), levels(auxGp)))
for(i in 1:7) {
inds = which(as.numeric(auxGp)==i)
T3max[,i] = apply(abs(Rtmp[inds,,]-apply(AV, 3:6, sum)[inds]),2,max) }
round(T3max,3)
[0,1) [1,6) [6,16) [16,51) [51,151) [151,1e+03) [1e+03,1e+04)
Sim3 24.806 123.630 118.500 63.570 77.686 113.949 96.061
Sim4 115.521 115.940 118.500 44.371 79.840 78.893 84.021
Sim5 44.719 76.395 48.859 58.562 48.317 46.424 54.329
Sim6 49.481 47.587 60.871 46.828 46.331 51.978 54.247
Sim7 46.187 50.244 65.666 41.018 45.543 51.461 56.296
Sim8 35.754 44.754 43.432 32.179 32.238 31.943 31.250
Sim9 41.198 36.076 35.262 28.772 31.442 32.776 31.039
Sim10 24.097 22.308 22.278 30.166 26.360 27.077 24.962
Sim11 6.917 9.968 6.300 7.357 7.402 6.777 7.030
Sim12 9.649 8.522 6.379 7.877 5.672 7.054 6.243
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.