gcipdr-package: Gaussian Copula (based) Individual Person Data (IPD)...

Description Details See Also Examples

Description

The proposed Gaussian Copula technique generates pseudodata replacing IPD, using simple summaries, like IPD empirical marginal moments and correlation matrix, as only input data. In contexts where IPD is not available, pseudodata is a mean to compute approximate IPD inferences. The approach has applications in statistical disclosure control, distributed computing (e.g. see DataShield – see https://cran.datashield.org/), and, ideally research synthesis or reproduction.

Details

This package is a beta version and extensive editing, documenting, and amelioration shall be expected in the future.

See Also

https://github.com/bonorico/gcipdr

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# eventually set global parallelization options with 'options(mc.cores='your numebr of cores')'

##### LEVEL 1: RAW DATA HOLDER (RDH) COMPUTES IPD SUMMARY AND DISCLOSE IT


IPD <- airquality[, -6]  # drop redundat variable Day 

# RDH gets IPD in suitable format for analysis 

md <- Return.IPD.design.matrix(IPD)  # do you want to impute NA's later ?

attributes(md)

# COMPRESSION PHASE: RDH computes IPD summary 


IPDsummary <- Return.key.IPD.summaries( md, "moment.corr" )

class(IPDsummary)

names(IPDsummary)

## summaries to be disclosed

moms <- IPDsummary$first.four.moments   

corr <- IPDsummary$correlation.matrix

corr[lower.tri(corr)]  # inspect

n <- IPDsummary$sample.size

supp <- IPDsummary$is.binary.variable

names <- IPDsummary$variable.names

##### LEVEL 2: SUMMARY DATA HOLDER (SDH) RETRIEVE IPD SUMMARY ONLY AND SIMULATES IPD FROM IT



 H <- 300  ## number of artificial IPD copies to be generated


    set.seed(8736, "L'Ecuyer")

 ## Gaussian copula based IPD simulation

 
 system.time(
     
 IPDstar <- DataRebuild( H, n, correlation.matrix = corr, moments = moms,
      x.mode = supp, data.rearrange = "norta",
              corrtype = "moment.corr", marg.model = "gamma",
     variable.names = names, checkdata = TRUE, tabulate.similar.data = TRUE  )

 )



 class(IPDstar)
 names(IPDstar)

 # diagnostic: is input IPD summary well reproduced by artificial IPD ?

 IPDstar$is.similar



 ### REPRODUCE AN ORIGINAL IPD INFERENCE FROM ARTIFICIAL IPD



   formula <- Ozone ~ Solar.R + Wind + Temp + Month

 origIPDinfer <- glm( formula, data = IPD  )

 oom <- summary(origIPDinfer)
  origbeta <- oom$coef[, 1]
 origsd <- oom$coef[, 2]
origaic <- oom$aic

#

 stat <- function( x ){

     x <- as.data.frame(x)
     
 model <- glm( formula, data = x )

     out <- summary(model)
     
  return(
      list( coef= out$coefficient[, 1], sd = out$coefficient[, 2], aic = out$aic )
      )
    }



 ### APPLY STATISTICAL PROCEDURE ON EACH IPD ARTIFICIAL COPY


 IPDinferstar <- lapply( IPDstar$Xspace, function(x) stat(x) )



### extract wanted inferences

  IPDinferstar.out <- lapply(c("coef", "sd", "aic"), function(j){
do.call("rbind",
       lapply(1:H, function(i){

           IPDinferstar[[i]][[j]]
           
       }) )
    }
       )  ; names(IPDinferstar.out) <- c("coef", "sd", "aic")


 ### SUMMARY OPERATION ON SIMULATED INFERENCES

MCmeanINFER <- lapply(IPDinferstar.out, function(x) apply(x, 2, mean, na.rm = TRUE))


### MC standard deviation of coef only 

MCsdINFER <- apply(IPDinferstar.out$coef, 2, sd, na.rm = TRUE)



 #### compare inferences !!!!

data.frame( names= c(rep(names,2), "") ,
 est= c(rep("logbeta", length(origbeta)),
rep("logsdbeta", length(origbeta)), "aic") ,
originalIPD = c( origbeta, origsd, origaic),
reconstructed = unlist(MCmeanINFER) )

bonorico/gcipdr documentation built on May 2, 2021, 8:12 p.m.