demo/pmdata.R

library(NU.Learning)

# Use the <Particulate Matter> data of Obenchain and Young (2022).
data(pmdata)
# pmdata = read.csv(file="pmdata.csv")           # 2,980 observations on 122 Variables...
# outcome:    AACRmort ...continuous CDC measure of "Age Adjusted" Circulatory and/or Respiratory deaths per 100K US County residents.
# treatment:  Bvoc     ...continuous Exposure (Biogenic volatile organic compounds; micro g/m^3)
# str(pmdata, list.len=122)

data9 = na.omit(subset(pmdata, select = c(AACRmort, Bvoc, pmSO4, Avoc, PREMdeath, ASmoke, ChildPOV, IncomIEQ, pmSOA)))
str(data9)
# 'data.frame':   2973 obs. of  9 variables:
#  $ AACRmort : num  382 294 325 399 427 ...
#  $ Bvoc     : num  2.72 2.1 2.2 3.18 2.26 ...
#  $ pmSO4    : num  1.07 1.027 0.962 1.044 1.089 ...
#  $ Avoc     : num  1.41 1.31 1.32 1.4 1.53 ...
#  $ PREMdeath: num  9409 7468 8929 11742 9359 ...
#  $ ASmoke   : num  0.191 0.168 0.215 0.199 0.197 ...
#  $ ChildPOV : num  0.193 0.176 0.396 0.275 0.194 0.457 0.366 0.257 0.325 0.275 ...
#  $ IncomIEQ : num  4.39 4.6 5.86 4.23 4.07 ...
#  $ pmSOA    : num  4.13 3.41 3.53 4.58 3.79 ...
#  - attr(*, "na.action")= 'omit' Named int [1:7] 231 1563 1624 1669 2299 2301 2672
#   ..- attr(*, "names")= chr [1:7] "231" "1563" "1624" "1669" ...
# write.csv(data9, file = "data9.csv",row.names = FALSE)
  
rm(pmdata)
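
# Optional quick look at the outcome and treatment variables (base R only; not part
# of the original demo): summary() and hist() are standard stats/graphics functions.
summary(data9[, c("AACRmort", "Bvoc")])
hist(data9$Bvoc, main = "Bvoc Exposure across US Counties", xlab = "Bvoc (micro g/m^3)")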

# Define Cluster Hierarchy for UNSUPERVISED, nonparametric analyses...
xvars  = c("Bvoc","Avoc","pmSO4","PREMdeath","ChildPOV","ASmoke")
hclobj = NUcluster(data9, xvars)       # default clustering method = "ward.D"
hclobj  
plot(hclobj)
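
# Optional sanity check (a minimal base-R sketch, not the NU.Learning method):
# re-cluster the same scaled x-Covariates with stats::hclust and tabulate the
# resulting cluster sizes at K = 50 clusters for comparison with lrcagg() below.
hc.chk  = hclust(dist(scale(data9[, xvars])), method = "ward.D")
sizes50 = table(cutree(hc.chk, k = 50))
summary(as.integer(sizes50))           # spread of cluster sizes at K = 50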

# Save NU.Learning basic parameter settings to an environment that will be Updated...
e = NUsetup(hclobj, data9, Bvoc, AACRmort)
ls.str(e)

# Compute and Save LRC distributions for a range of K = Number of Clusters...
mort010 = lrcagg( 10, e)
mort025 = lrcagg( 25, e)
mort050 = lrcagg( 50, e)               # Average Cluster Size: ~53 US Counties
plot(mort050, show="ecdf", e)
mort075 = lrcagg( 75, e)               # Average Cluster Size: ~35 US Counties
mort100 = lrcagg(100, e)
mort200 = lrcagg(200, e)               # Average Cluster Size: Only ~13 US Counties
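
# The eCDF display used above for mort050 works for any of the saved LRC fits;
# for example (optional, uncomment to view another K):
# plot(mort100, show = "ecdf", e)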

# "Sensitivity Analysis" Summary...
NUcompare(e)                 # LRC Distribution for 50 Clusters appears to Optimize Variance-Bias Trade-Offs... 

# Confirm: Does the Observed LRC distribution for 50 clusters truly differ from
# the Random Permutation NULL distribution assuming x_Covariates are Ignorable?
system.time( conf050 <- confirm(mort050) )       # Simulation takes ~ 5 seconds. 
conf050                                          # Print result summary...
plot(conf050)                                    # Compare Observed LRC [Blue] and "NULL" [Gray] eCDFs...

# Simulate maximum p-value for observed Kolmogorov-Smirnov D-statistic... 
system.time( ksd050 <- KSperm(conf050) )         # Simulation takes ~ 9 seconds for the Default number of reps=100 (replications)...   
ksd050                                           # Printed output and Plot imply this p-Value is at most 0.01...
plot(ksd050)                                     # That this p-Value is at most 0.001 can be shown using 1000 replications (takes ~ 82 secs)...
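
# A minimal sketch of the tighter bound noted above, assuming KSperm() exposes the
# replication count as a "reps" argument (its default reps = 100 suggests so);
# this run is markedly slower (~82 secs):
# ksd050x = KSperm(conf050, reps = 1000)
# plot(ksd050x)                          # supports p-Value <= 0.001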

# Example: "Most-Like-Me" Visualizations for Residents of
#   "OC" = Orange County, California (fips=6059):
# xvars:   "Bvoc"   "Avoc"    "pmSO4" "PREMdeath" "ChildPOV" "ASmoke"
xvecOC = c(0.64748, 2.890584, 0.930111, 4094.363, 0.145, 0.099952)  
mlmeOC = mlme(e, hclobj, mort050, xvecOC)
mlmeOC                       # Implicit print...
plot(mlmeOC, NN = 25)        # "Most-Like-Me" plot for the 25 "Nearest Neighbors" of Orange County, CA.
# Summary Statistics...
mlme.stats(mlmeOC, NN = 25)
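
# The "Most-Like-Me" machinery can be pointed at any covariate profile; as a
# purely illustrative (hypothetical) query, re-use the x-Covariates of the first
# row of data9 rather than an external county:
# xvecRow1 = as.numeric(data9[1, xvars])
# mlmeRow1 = mlme(e, hclobj, mort050, xvecRow1)
# plot(mlmeRow1, NN = 25)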

### End of demo(pmdata) ##############################################
 
