source("rpacksifter-2.R")
source("repSifter_reem.R")
source("funcs_sim.R")
datals <- data.gen(genmodel = "l",noisel = "s",sample=500,seed=123)
mils <- mi2l.pan(data=datals,mispct=0.2)
#Function to recognize the static location in multiple imputed datasets.
static_loc <- function(mi_object){
allimp <- complete(mi_object,action ="long")
s_locY <- allimp %>% group_by(.id)%>% summarise(static = (length(unique(Y))==1))
s_locK <- allimp %>% group_by(.id)%>% summarise(static = (length(unique(K))==1))
return(list(Y=s_locY,K=s_locK))
}
#Function to calculate the abs(hat.K - K) on average.
#The hat.K is based on whether the location has static values in MI data
#hat.K only depends on x4,x5 and visit for nonstatic locations and depends on Y,x4,x5 on static locations.
#for DataSifter all cells are nonstatic.
dis_risk <-function(rawdata,miobj,siftdata,siftdata1,simtype=c("l","n"),testrn=50){
#Datasets ready
n=nrow(rawdata)
modelform <- ifelse(simtype=="l","K~Y+x4+x5+visit+(1|ID)","K~sin(Y)+exp(cos(x4))+Y*abs(x5)+visit+(1|ID)")
modelformY <- ifelse(simtype=="l","Y~x1+x2+x3+visit+(1|ID)","Y~sin(x1)+(x2)^2+x1*abs(x3)+visit+(1|ID)")
mi_sloc<-static_loc(miobj)
mi_slocY <- mi_sloc$Y[1:testrn,]
mi_slocK <- mi_sloc$K[1:testrn,]
midata <- complete(miobj,action ="long")
midata <- midata %>% mutate(rowid=rep(1:n,max(midata$.imp)))
midata <- midata %>% filter(rowid %in% c(1:testrn))
siftdata <- siftdata %>% mutate(rowid=1:n)
siftdata$ID <- as.numeric(siftdata$ID)
siftdata <- siftdata[1:testrn,]
siftdata1 <- siftdata1 %>% mutate(rowid=1:n)
siftdata1$ID <- as.numeric(siftdata1$ID)
siftdata1 <- siftdata1[1:testrn,]
#Construct models based on D_{-i}
models <- sapply(1:testrn, function(x) lmer(modelform,data=rawdata[-x,]))
modelsY <- sapply(1:testrn, function(x) lmer(modelformY,data=rawdata[-x,]))
#{sid <- sample(c(1:n)[-x],floor(0.5*n))
rawdata <- rawdata[1:testrn,]
#predictions based on models
cidrow <- which(names(midata)=="rowid")
pred_mi <- data.frame(rowid=midata$rowid,k_i=apply(midata,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
colnames(x)<- colnames(midata)
if(mi_slocK$static[x[,cidrow]]==TRUE){
rawdata$K[x[,cidrow]]
}else{
predict(models[[x[,cidrow]]],newdata=x,allow.new.levels=TRUE)}}),
y_i=apply(midata,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
colnames(x)<- colnames(midata)
if(mi_slocY$static[x[,cidrow]]==TRUE){
rawdata$Y[x[,cidrow]]
}else{
predict(modelsY[[x[,cidrow]]],newdata=x,allow.new.levels=TRUE)}}))
pred_mi <- pred_mi %>% group_by(rowid) %>% summarise(mik_i=mean(k_i),miy_i=mean(y_i))
cidrowsif <- which(names(siftdata)=="rowid")
pred_ds <- data.frame(rowid=siftdata$rowid,K=rawdata$K,Y=rawdata$Y,dsk_i=apply(siftdata,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
names(x)<- names(siftdata)
predict(models[[unlist(x[,cidrowsif])]],newdata=x,allow.new.levels=TRUE)}),
dsy_i=apply(siftdata,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
names(x)<- names(siftdata)
predict(modelsY[[unlist(x[,cidrowsif])]],newdata=x,allow.new.levels=TRUE)}))
pred_ds1 <- data.frame(rowid=siftdata1$rowid,ds1k_i=apply(siftdata1,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
names(x)<- names(siftdata1)
predict(models[[unlist(x[,cidrowsif])]],newdata=x,allow.new.levels=TRUE)}),
ds1y_i=apply(siftdata1,1,function(x) {
x<-data.frame(matrix(x,nrow = 1))
names(x)<- names(siftdata)
predict(modelsY[[unlist(x[,cidrowsif])]],newdata=x,allow.new.levels=TRUE)}))
predfull <- pred_ds %>% left_join(pred_ds1,by="rowid") %>% left_join(pred_mi,by="rowid") %>%
mutate(midr_K=abs(mik_i-K),dsdr_K=abs(dsk_i-K),ds1dr_K=abs(ds1k_i-K),
midr_Y=abs(miy_i-Y),dsdr_Y=abs(dsy_i-Y),ds1dr_Y=abs(ds1y_i-Y))
return(predfull)
}
####TEST######
mispct=0.2
sample=500
seed=123
datals <- data.gen(genmodel = "l",noisel = "s",sample=sample,seed=seed)
dsreemls <- repSifter(data=datals,mispct=mispct,misw="perij",lnames=c("Y","K"),timevar="visit",ID="ID",reem = TRUE,maxit = 3)
mils <- mi2l.pan(data=datals,mispct=mispct)
dr <- dis_risk(rawdata = datals,miobj=mils,siftdata = dsreemls[[1]],siftdata1=dsreemls[[1]],simtype = "l")
mean(dr$midr)
mean(dr$dsdr)
mean(dr$ds1dr)
####Actual numbers#####
setwd("~/Box Sync/Longitudinal Sifter/MI_data_obfuscation/result/")
lsdr_full_500 <- data.frame()
lsdr_full_1000 <- data.frame()
lldr_full_500 <- data.frame()
lldr_full_1000 <- data.frame()
nsdr_full_500 <- data.frame()
nsdr_full_1000 <- data.frame()
nldr_full_500 <- data.frame()
nldr_full_1000 <- data.frame()
for(i in c(1:100)){
load(paste("simdata",i,".Rdata",sep = "")) #load sim500 sim1000 sim2000
#load n=500
lsraw500<-sim500$ls$raw
llraw500<-sim500$ll$raw
nsraw500<-sim500$ns$raw
nlraw500<-sim500$nl$raw
lsreem500<-sim500$ls$dsreem
llreem500<-sim500$ll$dsreem
nsreem500<-sim500$ns$dsreem
nlreem500<-sim500$nl$dsreem
lsmi500<-sim500$ls$mi
llmi500<-sim500$ll$mi
nsmi500<-sim500$ns$mi
nlmi500<-sim500$nl$mi
#load n=1000
lsraw1000<-sim1000$ls$raw
llraw1000<-sim1000$ll$raw
nsraw1000<-sim1000$ns$raw
nlraw1000<-sim1000$nl$raw
lsreem1000<-sim1000$ls$dsreem
llreem1000<-sim1000$ll$dsreem
nsreem1000<-sim1000$ns$dsreem
nlreem1000<-sim1000$nl$dsreem
lsmi1000<-sim1000$ls$mi
llmi1000<-sim1000$ll$mi
nsmi1000<-sim1000$ns$mi
nlmi1000<-sim1000$nl$mi
load(paste("lmm/simdata_lmm",i,".Rdata",sep = ""))
lslmm500<-sim500$ls$dslmm
lllmm500<-sim500$ll$dslmm
nslmm500<-sim500$ns$dslmm
nllmm500<-sim500$nl$dslmm
lslmm1000<-sim1000$ls$dslmm
lllmm1000<-sim1000$ll$dslmm
nslmm1000<-sim1000$ns$dslmm
nllmm1000<-sim1000$nl$dslmm
#Calculate DR for n=500
lsdr500 <- dis_risk(rawdata = lsraw500,miobj=lsmi500,siftdata = lsreem500,siftdata1=lslmm500,simtype = "l")
lldr500 <- dis_risk(rawdata = llraw500,miobj=llmi500,siftdata = llreem500,siftdata1=lllmm500,simtype = "l")
nsdr500 <- dis_risk(rawdata = nsraw500,miobj=nsmi500,siftdata = nsreem500,siftdata1=nslmm500,simtype = "n")
nldr500 <- dis_risk(rawdata = nlraw500,miobj=nlmi500,siftdata = nlreem500,siftdata1=nllmm500,simtype = "n")
lsdr_full_500<- rbind(lsdr_full_500,cbind(lsdr500,rep=i))
lldr_full_500<- rbind(lldr_full_500,cbind(lldr500,rep=i))
nsdr_full_500<- rbind(nsdr_full_500,cbind(nsdr500,rep=i))
nldr_full_500<- rbind(nldr_full_500,cbind(nldr500,rep=i))
#Calculate DR for n=100
lsdr1000 <- dis_risk(rawdata = lsraw1000,miobj=lsmi1000,siftdata = lsreem1000,siftdata1=lslmm1000,simtype = "l")
lldr1000 <- dis_risk(rawdata = llraw1000,miobj=llmi1000,siftdata = llreem1000,siftdata1=lllmm1000,simtype = "l")
nsdr1000 <- dis_risk(rawdata = nsraw1000,miobj=nsmi1000,siftdata = nsreem1000,siftdata1=nslmm1000,simtype = "n")
nldr1000 <- dis_risk(rawdata = nlraw1000,miobj=nlmi1000,siftdata = nlreem1000,siftdata1=nllmm1000,simtype = "n")
lsdr_full_1000<- rbind(lsdr_full_1000,cbind(lsdr1000,rep=i))
lldr_full_1000<- rbind(lldr_full_1000,cbind(lldr1000,rep=i))
nsdr_full_1000<- rbind(nsdr_full_1000,cbind(nsdr1000,rep=i))
nldr_full_1000<- rbind(nldr_full_1000,cbind(nldr1000,rep=i))
print(i)
}
#save(lsdr_full_500,
# lsdr_full_1000,
# lldr_full_500,
# lldr_full_1000,
# nsdr_full_500,
# nsdr_full_1000,
# nldr_full_500,
# nldr_full_1000,file = "../disclosurerisk.Rdata")#1-100
namechar <- list(c("lsdr","500"),
c("lsdr","1000"),
c("lldr","500"),
c("lldr","1000"),
c("nsdr","500"),
c("nsdr","1000"),
c("nldr","500"),
c("nldr","1000"))
j=1
library(dplyr)
fulldata <- data.frame()
temp <- lapply(list(lsdr_full_500,lsdr_full_1000,lldr_full_500,lldr_full_1000,nsdr_full_500,nsdr_full_1000,nldr_full_500,nldr_full_1000),function(i){
i%>% group_by(rep) %>% summarize(midr_K=mean(midr_K),midr_Y=mean(midr_Y),dsreem_K=mean(dsdr_K),dsreem_Y=mean(dsdr_Y),dsglmm_K=mean(ds1dr_K),dsglmm_Y=mean(ds1dr_Y))
#temp$setting=namechar[[j]][1]
#temp$n=namechar[[j]][2]
#fulldata <- rbind(fulldata,temp)
#j=j+1
})
for(i in c(1:length(namechar))){
temp[[i]]$setting=namechar[[i]][1]
temp[[i]]$n=namechar[[i]][2]
}
full_data <- do.call(rbind,temp)
#save(full_data,file = "../full_privacy_data.Rdata")
##############Plotting#################
library(ggplot2)
library(reshape2)
library(dplyr)
load("full_privacy_data.Rdata")
full_data$setting<-ifelse(full_data$setting=="lsdr","Linear Small Noise",
ifelse(full_data$setting=="lldr","Linear Large Noise",
ifelse(full_data$setting=="nsdr","Nonlinear Small Noise",
"Nonlinear Large Noise")))
full_data_long <- melt(full_data,id.vars=c("rep","setting","n"))
full_data_long <- full_data_long %>% mutate(lvar = sub("^[a-z]*_","",variable),
model = sub("(dr)?_[A-Z]","",variable))
full_data_long$setting <- factor(full_data_long$setting,levels=unique(full_data_long$setting))
full_data_long$n <- factor(full_data_long$n,levels=c(500,1000),labels = c("n=500","n=1000"))
ggplot(data = full_data_long,aes(x=lvar,y=value,fill=model))+geom_boxplot(lwd=0.2)+
facet_grid(n~setting,scales="free_y")+
scale_fill_manual(name="Method:",values = c("green","lightblue","red"))+
xlab("Longitudinal Variable Type")+
ylab("Mean Privacy Measurement")+
theme_bw()
View(full_data_long %>% group_by(setting,n,variable) %>% summarise(mean(value)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.