R/disclosure.R
In synthpop: Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

Documented in disclosure disclosure.list disclosure.synds print.disclosure

###-----new version with big denom checks---------------------
disclosure <- function(object, data, ...) UseMethod("disclosure")

###-----disclosure.default-----------------------------
disclosure.default <- function(object, ...)
  stop("No disclosure method associated with class ", class(object), call. = FALSE)

###-----disclosure.data.frame---disclosure.list--------
disclosure.data.frame <- disclosure.list <- 
  function(object, data, cont.na = NULL, keys , target , print.flag = TRUE,
           denom_lim = 5, exclude_ov_denom_lim = FALSE, 
           not.targetlev = NULL,
           usetargetNA = TRUE, usekeysNA = TRUE, 
           exclude.keys =NULL, exclude.keylevs = NULL, exclude.targetlevs = NULL,
           ngroups_target = NULL, ngroups_keys = NULL, 
           thresh_1way = c(50, 90),thresh_2way = c(4, 80),
           digits = 2, to.print =c("short"), compare.synorig = TRUE,...) 
    {
    if (is.null(object)) stop("Requires parameter 'object' to give name of the synthetic data.\n", call. = FALSE)   
    if (is.list(object) & !is.data.frame(object)) m <- length(object)
    else if (is.data.frame(object)) m <- 1
    else stop("object must be a data frame or a list of data frames.\n", call. = FALSE)
    
    # sort out cont.na to make it into a complete named list
    cna <- cont.na
    cont.na <- as.list(rep(NA, length(data)))
    names(cont.na) <- names(data)
    
    if (!is.null(cna)) {
      if (!is.list(cna) | any(names(cna) == "") | is.null(names(cna)))
        stop("Argument 'cont.na' must be a named list with names of selected variables.", call. = FALSE)
      if (any(!names(cna) %in% names(data))) stop("Names of the list cont.na must be variables in data.\n", call. = FALSE)
      for (i in 1:length(cna)) {
        j <- (1:length(data))[names(cna)[i] == names(data)]
        cont.na[[j]] <- unique(c(NA,cna[[i]]))
      }
    }
 ##------------------------------- check with synorig.compare ---------------------------------   
 
    if (compare.synorig) {
    
        if (m ==1) adjust.data <- synorig.compare(object,data, print.flag = FALSE) else
        if (m > 1) adjust.data <- synorig.compare(object[[1]],data, print.flag = FALSE)
        
        if (!adjust.data$unchanged) {
          object <- adjust.data$syn
          data <- adjust.data$orig
          cat("Synthetic data or original or both adjusted with synorig.compare to try to make them comparable\n\n")
          if (m > 1) {cat("only first element of the list has been adjusted and will be used here\n\n")
          m <- 1 }
        }
    } 

    object <- list(syn = object, m = m, cont.na = cont.na) 
    class(object) <- "synds"
    
    res <- disclosure(object, data, keys , target = target , denom_lim = denom_lim,
                      exclude_ov_denom_lim = exclude_ov_denom_lim, 
                      print.flag = print.flag, digits = digits, 
                      usetargetNA = usetargetNA, usekeysNA = usekeysNA,  
                      not.targetlev = not.targetlev,
                      exclude.keys = exclude.keys, exclude.keylevs =  exclude.keylevs,
                      exclude.targetlevs =  exclude.targetlevs, ngroups_target = ngroups_target,
                      ngroups_keys = ngroups_keys, thresh_1way = thresh_1way,thresh_2way = thresh_2way,
                      to.print = to.print, ...) 
    
    res$call <- match.call()
    return(res)
  }
    
###-----disclosure.synds-------------------------
disclosure.synds <-  function(object, data, keys , target , print.flag = TRUE,
                              denom_lim = 5, exclude_ov_denom_lim = FALSE, 
                              not.targetlev = NULL,
                              usetargetNA = TRUE, usekeysNA = TRUE, 
                              exclude.keys =NULL, exclude.keylevs = NULL, exclude.targetlevs = NULL,
                              ngroups_target = NULL, ngroups_keys = NULL, 
                              thresh_1way = c(50, 90),thresh_2way = c(4, 80),
                              digits = 2, to.print =c("short"),...) 
{ 

###-----------------------check input parameters ----
    if (!(is.data.frame(data)) )   stop("data  must be a data frame \n\n", call. = FALSE)
   data <- data.frame(data) ## in case it is table or a tibble
    if (!( inherits(object,"synds")))   stop(" object must be an object of class synds\n\n", call. = FALSE)
   
   if (is.numeric(keys) & !all(keys %in% 1:dim(data)[2])) stop("If keys are numeric they must be in range 1 to ",
                                      dim(data)[2] , call. = FALSE  )
   if (is.numeric(keys)) keys <- names(data)[keys]
   if (is.numeric(target) & !all(target %in% 1:dim(data)[2])) stop("If target is numeric it must be in range 1 to ",
                                                                   dim(data)[2] , call. = FALSE  )
   if (is.numeric(target)) target <- names(data)[target]
  
  Norig <- dim(data)[1]
  if (object$m ==1 ){ names.syn <- names(object$syn) 
  } else {names.syn <- names(object$syn[[1]])}
  # target must be a single variable in  data and object$syn
  # keys must be a vector of variable names in data and in s
  # target must not be in keys
if (!(all(keys %in% names(data)) && all(keys %in% names.syn) 
      && all(target %in% names(data)) & all(target %in% names.syn )))
  stop("keys and target must be variables in data and synthetic data. \n", call. = FALSE)
   if (any(duplicated(keys)))
     stop("keys cannot include duplicated values. \n", call. = FALSE)
  if (!(length(target)==1))
    stop("target must be a single variable \n", call. = FALSE)
   if (target %in% keys)
     stop("target cannot be in keys \n", call. = FALSE)
   if (any(names(data)  == "target"))
     stop("your data have a variables called 'target' please rename in original and synthetic data.\n\n", call. = FALSE) 

   if (!(length(usetargetNA)==1))
     stop("usetargetNA must be a single logical value \n", call. = FALSE)
   if (length(usekeysNA) ==1 ) usekeysNA <- rep(usekeysNA, length(keys))
   if (!(length(usekeysNA)==length(keys)))
     stop("usekeysNA must be a logical value of same length as keys\n", call. = FALSE)
  # get keys and usekeysNA in same order as in data
   
   #cat(keys,"keys\n", usekeysNA,"usekeysNA\n")
   #cat((1:length(names(data)))[names(data) %in% keys],"(1:length(names(data)))[names(data) %in% keys]\n")
   oldkeys <- keys
   keys <- names(data)[(1:length(names(data)))[names(data) %in% keys]]
   usekeysNA <- usekeysNA[match(oldkeys,keys)]

     # check excluded combinations
  if (!is.null(exclude.keys)) {
    if (! length(exclude.keylevs) == length(exclude.keys) & 
        length(exclude.keylevs) == length(exclude.targetlevs) ) stop("All excludes must be the same length\n" , call. = FALSE)
    if (!all(exclude.keys %in% keys)) stop("exclude.keys must be the name of one of your keys", call.= FALSE)
  }
   if(!is.null(denom_lim)){
     if (!(round(denom_lim) == denom_lim && denom_lim > 0 )){
       cat(denom_lim ,"denom_lim\n")
       stop("\ndenom_lim must be an integer >0\n", call.= FALSE)
     }
   }
   
    if (!is.null(ngroups_target)) {
      if(!(length(ngroups_target) ==1 )) stop("\nngroups_target must be a single value", call.= FALSE)
      if( ngroups_target  == 1 )
        stop("\nTarget ngroups cannot be set to 1", call.= FALSE)
    }
   
   if (!is.null(ngroups_keys)) {
     if (length(ngroups_keys) == 1) ngroups_keys <- rep(ngroups_keys, length(keys))
       if(!(length(ngroups_keys) == length(keys) ))
       stop("\nngroups_keys must be an vector of length 1 or same as keys", call.= FALSE)
     if( any( ngroups_keys  ==1) )
       stop("\nElements of ngroups cannot be set to 1", call.= FALSE)
   }
###---------------------- define output items-------------
   
  allCAPs         <- matrix(NA,object$m,5)
  attrib   <- matrix(NA,object$m,8)
  ident  <- matrix(NA,object$m ,4)

  dimnames(allCAPs) <- list(1:object$m, c("baseCAPd","CAPd", "CAPs", "DCAP","TCAP"))
  dimnames(attrib) <- list(1:object$m,c("Dorig","Dsyn","iS", "DiS","DiSCO",  "DiSDiO", "max_denom","mean_denom"))
  dimnames(ident) <- list(1:object$m,c("UiO", "UiS","UiOiS", "repU"))
  Nexclusions <- list(1:object$m)
  Nexclusions <- check_1way <- check_2way <-list(1:object$m)

###-----------------------restrict data sets to targets and keys------------------------------
m <- object$m
if (m == 1) syndata <- list(object$syn) else syndata <- object$syn
  dd <- data ## rename target variable to target
  targetx =dd[, names(dd) == target]
  dd$target <- targetx
 # cat(names(dd),"names(dd)\n")
  dd <- as.data.frame(dd[,c( "target",keys)])
  for (jj in 1:m) {
      targetx = syndata[[jj]][, names(syndata[[jj]]) == target]
      syndata[[jj]]$target <- targetx
      syndata[[jj]] <- as.data.frame(syndata[[jj]][, c( "target",keys)])
  }
###----- get cont.na parameters for stratified synthesis----------------------------------------
  # --------
  if (!is.null(object$strata.syn)) cna <- object$cont.na[1, ] 
  else   cna <- object$cont.na
###------ get cna levels for target and keys  ------------------------

  cna <- cna[c(target,keys)]

  for ( i in 1:length(cna)) {
    nm <- names(cna)[i]
    vals <- unique(cna[[i]][!is.na(cna[[i]])])  # get variables with cont.na other than missing
    if (length(vals) > 0){
      for (j in 1:length(vals))
        n_cna <- sum(vals[j] == data[,nm] & !is.na(data[,nm]))
      if (n_cna == 0) stop("\nValue ", vals[j], " identified as denoting a special or missing in cont.na for ",nm, " is not in data.\n",sep = "", call. = FALSE)
      else if (n_cna < 10 & print.flag) cat ("\nWarning: Only ",n_cna ," record(s) in data with value ",vals[j]," identified as denoting a missing value in cont.na for ",nm, "\n\n", sep = "")
    }
  }
  ###----------------------- group any continuous variables if ngroups not NULL----------------------
  ### needs a seperate loop to fix all syn data sets with the same breaks-----------------
  # Numeric variables
  if ( is.null(ngroups_target) ) ngroups_target <- 0
  if ( is.null(ngroups_keys) )    ngroups_keys  <- 0
  if (length(ngroups_keys) == 1)  ngroups_keys  <- rep(ngroups_keys,length(keys))
  ngroups <- c(ngroups_target,  ngroups_keys)

  if (print.flag) if (any (ngroups >0 & !sapply(dd,is.numeric))) cat("\n\nWith target",target,"variable(s) you have asked to group are not numeric:\n",
                   names(dd)[ngroups >0 & !sapply(dd,is.numeric)]," no grouping done for them.\n")
  if (any(ngroups > 0)){
   togroup <- (1:dim(dd)[2])[ ngroups >0 & sapply(dd,is.numeric) ]
   for (i in togroup) {
      syn0 <- c(sapply(syndata, '[[', i)) ## all synthetic values for ith var
      for (j in 1:m) {
            grpd <- group_num(dd[,i], syndata[[j]][,i], syn0,
                            ngroups[i], cont.na = cna[[i]], ...)
           if (length(table(grpd[[1]])) < 3 ) {
             grpd <- group_num(dd[,i],  syndata[[j]][,i], syn0,
                            cont.na = cna[[i]], n = ngroups[i], style = "equal")
             if (length(table(grpd[[1]])) < 3 ) cat("Only",length(table(grpd[[1]])),"groups produced for", names(data)[j],"even after changing method.\n")
             else if (print.flag) cat("Grouping changed from 'quantile' to  'equal' in function numtocat.syn for",names(data)[j],"because only",length(table(grpd[[1]]))," groups produced\n")
           }
       syndata[[j]][,i] <- grpd[[2]]
      }
      dd[, i] <- grpd[[1]]
      if (print.flag) {
        if (i  == 1) cat("Numeric values of",names(dd)[i],target,"grouped into ", length(table(dd[,i])),"groups\n with levels", names(table(dd[,i])),"\n")
        else cat("Numeric values of key",names(dd)[i],"grouped into ", length(table(dd[,i])),"groups\nwith levels", names(table(dd[,i])),"\n")
      }
   }
}
###--- make any remaining numeric values into factors---------------------------------------

  if (any(sapply(dd,is.numeric))){
    numvars <- (1:dim(dd)[2])[sapply(dd,is.numeric)]
    for ( i in numvars) {
      dd[,i] <- factor(dd[,i])
      for (jj in 1:object$m){ syndata[[jj]][,i] <- factor(syndata[[jj]][,i])
     }
    }
  }
  check1 <- check2 <- ""

###------------------- loop over object$m syntheses---------------

  for ( jj in 1:object$m) {
###-------------------------------- PRINT WHERE AT ----------------------------------   
    if (print.flag) cat("-------------------Synthesis",jj,"--------------------\n") 
    
    ss <- syndata[[jj]]
###----------------------- sort missings-----------------     
###  replace Missing values with factor value of "Missing" to make tables easier
    tomissing <- function(x){
      if (!is.factor(x)) stop("x must be a factor\n",.call =FALSE)
      x <- as.character(x)
      x[is.na(x)] <- "Missing"
      x <- factor(x)
    }
    if ( any(is.na(dd$target)) ) dd$target <- tomissing(dd$target)
    if ( any(is.na(ss$target)) ) ss$target <- tomissing(ss$target)

    for ( i in 1:length(keys)) {
      if ( any(is.na(dd[,names(dd) == keys[i]])) ) dd[,names(dd) == keys[i]] <-
                                      tomissing(dd[,names(dd) == keys[i]])
      if ( any(is.na(ss[,names(ss) == keys[i]])) ) ss[,names(ss) == keys[i]] <-
          tomissing(ss[,names(ss) == keys[i]])

 }

     Nd <- dim(dd)[1]
     Ns <- dim(ss)[1]
###------------------------ make composite variable for keys --------------------------
if (length(keys) >1) {
   ss$keys <- apply(ss[, names(ss) %in% keys],1,function(x) paste(x, collapse = " | "))
   dd$keys <-   apply(dd[, names(dd)  %in% keys],1,function(x) paste(x, collapse = " | "))
} else {
  ss$keys <- ss[, names(ss) == keys]
  dd$keys <- dd[, names(dd) == keys]
}


  
###--------------------- make tables ---------------------------

  NKd <- length(table(dd$keys))
  NKs <- length(table(ss$keys))

  tab_kts <- table(ss$target,ss$keys)
  tab_ktd <- table(dd$target,dd$keys)   ## two way target and keys table orig
  if (print.flag) cat("Table for target",target, "from GT alone with keys has",
                     dim(tab_ktd)[1], "rows", dim(tab_ktd)[2], "colums.\n")
         
  
  ###  get keys in s and d
  #
  Kd <- names(table(dd$keys))
  Ks <- names(table(ss$keys))
  Kboth <- Kd[Kd %in% Ks]
  Kall <- c(Kd,Ks)
  Kall <- Kall[!duplicated(Kall)]

  ### same thing for target
  
  Td <- names(table(dd$target))
  Ts <- names(table(ss$target))
  Tboth <- Td[Td %in% Ts]
  Tall <- c(Td,Ts)
  Tall <- Tall[!duplicated(Tall)]

  ### augment keys tables to match

   if (!(all(Kd %in% Ks))) { ## some original keys not found in synthetic
    extraKd <- Kd[!(Kd %in% Ks) ]
    extra_tab <- matrix(0,dim(tab_kts)[1],length(extraKd))
    dimnames(extra_tab) <- list(dimnames(tab_kts)[[1]],extraKd)
    tab_kts <- cbind(tab_kts,extra_tab)
    tab_kts  <- tab_kts[, order(dimnames(tab_kts)[[2]]), drop = FALSE] 
   }


  if (!(all(Ks %in% Kd))) {  ## extra synthetic keys not in original
    extraKs <- Ks[!(Ks %in% Kd) ]
    extra_tab <- matrix(0,dim(tab_ktd)[1],length(extraKs))
    dimnames(extra_tab) <- list(dimnames(tab_ktd)[[1]],extraKs)
    tab_ktd <- cbind(tab_ktd,extra_tab)
    tab_ktd <- tab_ktd[,order(dimnames(tab_ktd)[[2]]), drop = FALSE]
  }
  if (!(all(Td %in% Ts))) { ## some original target levels not found in synthetic
    extraTd <- Td[!(Td %in% Ts) ]
    if (is.null(dim(tab_kts)))  extra_tab <- matrix(0,length(extraTd),1)
    else extra_tab <- matrix(0,length(extraTd),dim(tab_kts)[2])
    dimnames(extra_tab) <- list(extraTd , dimnames(tab_kts)[[2]])
    tab_kts <- rbind(tab_kts,extra_tab)
    tab_kts  <- tab_kts[order(dimnames(tab_kts)[[1]]), , drop = FALSE] 
  }   else extraTd <- NULL

  if (!(all(Ts %in% Td))) {  ## extra synthetic target levels not in original  ############### 
    extraTs <- Ts[!(Ts %in% Td) ]
    extra_tab <- matrix(0,length(extraTs),dim(tab_ktd)[2],)
    dimnames(extra_tab) <- list(extraTs,dimnames(tab_ktd)[[2]])
    tab_ktd <- rbind(tab_ktd,extra_tab)
    tab_ktd <- tab_ktd[order(dimnames(tab_ktd)[[1]]),] 
  }   else extraTs <- NULL
 
  if(print.flag) cat("Table for target ",target, "from GT & SD with all key combinations has",
                     dim(tab_ktd)[1], "rows", dim(tab_ktd)[2], "colums.\n")  
###------------------------- calculate proportions and margins ---------
  tab_ktd_p <- sweep(tab_ktd,2,apply(tab_ktd,2,sum),"/")
  tab_ktd_p[is.na(tab_ktd_p)] <- 0
  tab_kts_p <- sweep(tab_kts,2,apply(tab_kts,2,sum),"/")
  tab_kts_p[is.na(tab_kts_p)] <- 0

  tab_kd <- apply(tab_ktd,2,sum)
  tab_td<- apply(tab_ktd,1,sum)
  tab_ks <- apply(tab_kts,2,sum)
  tab_ts <- apply(tab_kts,1,sum)

  NKall <- length(tab_kd)
  NKboth <- length(Kboth)
  NTd <- length(Td)
  NTs <- length(Ts)
  Nboth <-  sum(tab_kd[names(tab_kd) %in% Kboth])
  Nd_ins <- sum(tab_kd[names(tab_kd) %in% Ks])
###------------------------get tables for  calculating attribute disclosure measures-----------------------------------------------------

did <- tab_ktd ; did[tab_ktd_p != 1] <- 0
dis <- tab_kts ; dis[tab_kts_p != 1] <- 0
keys_syn <- apply(tab_kts,2,sum)
tab_iS <- tab_ktd
tab_iS[,keys_syn ==0 ] <- 0
tab_DiS <- tab_ktd
anydis <- apply(tab_kts_p,2,function(x) any(x ==1))
tab_DiS[,!anydis] <- 0
tab_DiSCO <- tab_iS
tab_DiSCO[tab_kts_p != 1] <- 0
tab_DiSDiO <- tab_DiSCO
tab_DiSDiO[tab_ktd_p != 1] <- 0


Nout <- rep(0, 6)
names(Nout) = c("excluded target","missing target","missing in  keys", "set to exclude","over denom_lim","remaining")
Nexcludes <- matrix(0,8,6)
dimnames(Nexcludes) <- list(c("original","synthetic","Dorig","Dsyn","iS", "DiS", "DiSCO", "DiSDiO"),
                            c("excluded target","missing target","missing in  keys", "set to exclude","over denom_lim","remaining"))
###----------------------------- now exclusions----------------------------------

tab_exclude <- function(xx,col,Nexcludes) {
 total <- sum(xx)
 ###------------------- drop all target records with not.targetlev---------------------------------
 if (!is.null(not.targetlev) && !(not.targetlev == ""))
  {
   Nout[1] <- sum(xx[dimnames(xx)[[1]] %in% not.targetlev,])
   xx[dimnames(xx)[[1]] %in% not.targetlev,] <- 0
 }
  ###------------------------ items excluded fOr missing values -------------------------
  #
##### missings excluded values from tables if set

if (!usetargetNA && any(dd$target == "Missing")) {
  Nout[2] <- sum(xx[dimnames(xx)[[1]] == "Missing",])
  xx[dimnames(xx)[[1]] == "Missing",] <- 0
}

 for (i in 1:length(keys)){
   if (!usekeysNA[i]) {   ## do not use NA values for ith key
     key_levs <- dimnames(xx)[[2]]
     drop_d <-  word(key_levs,i, sep = fixed(" | ")) == "Missing"
     Nout[3] <- Nout[3] + sum(xx[,drop_d])
     xx[,drop_d] <- 0
  }
}
###-------------------------- remove any excluded two way  combinations----------------------------------
 if (!is.null(exclude.keys)) {

  if (!all(exclude.targetlevs %in% levels(dd$target))) stop("exclude.targetlevs must be one of levels of ",target,"\n", call. =FALSE)
  
   for (i in 1:length(exclude.keys)){
    vout <- (1:(dim(xx)[1]))[dimnames(xx)[[1]] == exclude.targetlevs[i]]
    klev <- levels(dd[, names(dd) == exclude.keys[i]])
    if (!all(exclude.keylevs[i] %in% klev)) stop("exclude.keylevs position ",i, " must be one of levels of ",keys[i],"\n", call. =FALSE)#
    kind <- (1:length(keys))[keys == exclude.keys[i]]
    wordk <- word(dimnames(xx)[[2]], start = rep(kind, dim(xx)[2]), sep = fixed(" | "))
    kout <- (1:dim(tab_ktd)[2])[wordk == exclude.keylevs[i]]
    Nout[4] <- Nout[4] + sum(xx[vout, kout])
    xx[rep(vout,length(kout)), kout] <- 0
   }
 }  

###------------------ exclude if over denom_lim ----------------------------------
 if (exclude_ov_denom_lim)  {
   Nout[5] <- sum(xx[xx > denom_lim ])
   xx[xx > denom_lim ] <- 0
 }
 Nout[6] <-  total -sum(Nout[1:5])
 
###---------------------------- copy Nout into rows of Nexcludes ------------------------------
 Nexcludes[col,] <- Nout
   return(list(tab = xx, Nexcludes = Nexcludes))
}

yy <- tab_exclude(tab_ktd,1,Nexcludes)
tab_ktd <- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(tab_kts,2,Nexcludes)
tab_kts <- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(did,3,Nexcludes)
did <- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(dis,4,Nexcludes)
dis <- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(tab_iS,5,Nexcludes)
tab_iS<- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(tab_DiS,6,Nexcludes)
tab_DiS<- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(tab_DiSCO,7,Nexcludes)
tab_DiSCO <- yy$tab; Nexcludes <- yy$Nexcludes
yy <- tab_exclude(tab_DiSDiO,8,Nexcludes)
tab_DiSDiO <- yy$tab; Nexcludes <- yy$Nexcludes
###--------------------- exclusions done-----------------------------------------
###-----------------------get  identity disclosure measures -------------

tab_ks <- apply(tab_kts,2,sum)
tab_kd <- apply(tab_ktd,2,sum)
tab_ks1 <- tab_ks
tab_ks1[tab_ks1>1] <- 0
tab_kd1 <- tab_kd 
tab_kd1[tab_kd1>1] <- 0
tab_kd1_s <- tab_kd1[names(tab_kd1) %in% Ks]
tab_ksd1 <-tab_kd[tab_ks == 1 & tab_kd == 1] ## repU

tab_ktd_p <- sweep(tab_ktd,2,apply(tab_ktd,2,sum),"/")
tab_ktd_p[is.na(tab_ktd_p)] <- 0
tab_kts_p <- sweep(tab_kts,2,apply(tab_kts,2,sum),"/")
tab_kts_p[is.na(tab_kts_p)] <- 0

UiS<- sum(tab_ks1)/Ns*100
UiO<- sum(tab_kd1)/Nd*100
UiOiS<- sum(tab_kd1_s)/Nd*100
repU <- sum(tab_ksd1)/Nd*100

ident[jj,] <- c( UiO,UiS, UiOiS,repU )

###----------------------------- attrib dis measures-------------------------

Dorig <- sum(did)/Nd*100
Dsyn <- sum(dis)/Ns*100
iS <- sum(tab_iS)/Nd*100
DiS <- sum(tab_DiS)/Nd*100
DiSCO <- sum(tab_DiSCO)/Nd*100
DiSDiO <- sum(tab_DiSDiO)/Nd*100

attrib[jj,] <- c( Dorig,Dsyn,iS,DiS,DiSCO,DiSDiO,max(tab_DiSCO),mean(tab_DiSCO[tab_DiSCO>0]))
Nexclusions[[jj]] <- Nexcludes

###----------------- get  CAP and DCAP measures---------------------

baseCAPd <- sum((tab_td/Nd)**2)*100 ## just from univariates
CAPd <-   sum(apply(tab_ktd_p^2,2,sum)*tab_kd)/Nd*100 ## from tables

CAPs <-   sum(apply(tab_kts_p^2,2,sum)*tab_ks)/Ns*100
DCAP <-   sum(apply(tab_kts_p*tab_ktd,2,sum))/Nd*100
##restrict to uniques in d
tab_kts_pU <- tab_kts_p
tab_kts_pU[,tab_kd != 1] <- 0
TCAP_denom <- sum(tab_kd[tab_ks >0])
#TCAP0 <-   sum(apply(tab_kts_pU*tab_ktd,2,sum))/Nd*100
TCAP <-   sum(tab_DiSCO)/TCAP_denom*100

#cat(baseCAPd,"baseCAPd",TCAP0,"TCAP0",TCAP,"TCAP\n")
allCAPs[jj,] <- c( baseCAPd,  CAPd,  CAPs, DCAP,TCAP)

#if (print.flag) cat("-------------------Disclosure measures completed\n")

#if (print.flag) cat("------------------Now 1 and 2 way checks\n")
###----------------------- checks for most_dis_lev 1 way ---------------------------
  
tab_target <- apply(tab_ktd,1,sum)
tab_dis_target <- apply(tab_DiSCO,1,sum)
pctdisLev <- max(tab_dis_target)/sum(tab_dis_target)*100
totalDisclosive <- sum(tab_dis_target)
PctDisAll <- totalDisclosive/sum(tab_target)*100
tab_dis_target <- sort(tab_dis_target, decreasing = TRUE)
most_dis_lev <- names(tab_dis_target)[1]
PctLevelAll <- tab_target[names(tab_target) == most_dis_lev]/sum(tab_target)*100
PctLevelDis <- tab_dis_target[names(tab_dis_target) == most_dis_lev]/sum(tab_dis_target)*100
nLevelDis <-tab_dis_target[names(tab_dis_target) == most_dis_lev]

if (nLevelDis > thresh_1way[1] &  PctLevelDis > thresh_1way[2]) {
    check_1way[[jj]] <- data.frame(Level = most_dis_lev, All = sum(tab_target) ,PctLevelAll =  PctLevelAll,
                                totalDisclosive =  totalDisclosive, nLevelDis = nLevelDis,PctLevelDis =  PctLevelDis)
    check1 <- paste("Check " ,target," level ",most_dis_lev)
}
else check_1way[[jj]] <- ""
#if (print.flag) cat("------------------One-way checks completed\n")
###------------------------get details for check_2way-----------------
### only implemented for DiSCO  though could be changed

xx <- tab_DiSCO
if (any(xx > thresh_2way[1]))  {
  
  denoms <- as.vector(xx[xx > thresh_2way[1]])
  rows <- rep(1:dim(xx)[1],dim(xx)[2])
  rows <- rows[xx > thresh_2way[1]]
  cols <- rep(1:dim(xx)[2],rep(dim(xx)[1],dim(xx)[2]))
  cols <- cols[xx > thresh_2way[1]]
  
  target_levs = dimnames(xx)[[1]][rows]
  key_levs = dimnames(xx)[[2]][cols]
  temp <- data.frame(denoms,target_levs,key_levs)
  for (i in 1:length(keys)) {
    temp[,3+i] <- paste(target_levs, word(key_levs,i, sep = fixed(" | ")), sep="|")
    if (i ==1) allpairs <- data.frame(npairs = table(rep(temp[,3+i],denoms)),keys = keys[i])
    else allpairs <- rbind(allpairs, data.frame(npairs = table(rep(temp[,3+i],denoms)),keys = keys[i]))
  }
  names(allpairs)<- c("target_key_levs","npairs","key")
  allpairs <- allpairs[order(-allpairs$npairs),]
  allpairs$target_lev <- word(allpairs$target_key_levs, 1,sep=fixed("|"))
  allpairs$key_lev <- word(allpairs$target_key_levs, 2,sep=fixed("|"))
  allpairs$key_total <- allpairs$key_target_total <- 0

  for (ii in 1:dim(allpairs)[1]) {

    tttab <- table(dd$target[dd[,names(dd)==allpairs$key[ii]] == allpairs$key_lev[ii] ])
    allpairs$key_total[ii] <- sum(tttab)
    allpairs$key_target_total[ii] <- tttab[names(tttab) == allpairs$target_lev[ii]]
     }

  allpairs$PctTargetKeyLevel <- allpairs$key_target_total*100/allpairs$key_total
 #


  details <- allpairs[allpairs$PctTargetKeyLevel > thresh_2way[2],c(1:3,6:8)]
  if (dim(details)[1] >0 ) {
    check_2way[[jj]] <- details 
    check2 <- details[1,]
    check2 <- paste(dim(details)[1],"pairs need checks")
  }
  else  check_2way[[jj]] <- ""
} 
else  check_2way[[jj]] <- ""

#if (print.flag) cat("----------------------------Two-way checks completed.\n")
###-------------------------- end of jj loop -----------------------------------

} ######## end of jj loop
################################################ save ###############################
if (object$m ==1){
  if (!check1 == "") check_1way = check_1way[[1]]
  if (!check2 == "") check_2way = check_2way[[1]]
}
    
res <- list(ident = data.frame(ident), attrib = data.frame(attrib), 
            allCAPs = data.frame(allCAPs), check_1way = check_1way, 
            check1 = check1, check_2way = check_2way,
            check2 =check2, Nexclusions = Nexclusions,
            keys = keys, target = target, digits = digits, Norig = Norig,
            to.print = to.print, call = match.call())
class(res) <- "disclosure"
return(res)
}
###---------------------------print.disclosure---------------------------- 
print.disclosure <- function(x, to.print = NULL, digits = NULL,   ...)
{
  if (is.null(to.print)) to.print <- x$to.print
  if (is.null(digits)) digits <- x$digits

  if (!all(to.print %in% c("short","allCAPs","ident","attrib",
                           "check_2way","check_1way","all","exclusions")))   
  stop("to.print must be  choices from 'short','allCAPs',
        'ident','attrib', 'check_2way','check_1way','all','exclusions'\n", call. = FALSE)
  
  if (length(to.print) >1 & ("short" %in% to.print)) stop("A 'Short' entry in to.print should not be combined with other options.\n", call. = FALSE)
  
  
  cat("Disclosure measures from synthesis for",x$Norig, "records in original data.\n")

  
  nexcluded <- sapply(x$Nexclusions,function(x) sum(x[,-6]))
  if  (any(nexcluded>0)) {
    cat("\nSome records excluded from the evaluation of disclosure risks")
    cat("\nSee details by adding 'exclusions' to the parameter to.print of disclosure,
    or by printing the details with to,print including 'exclusions'.")
  }
  
  if (length(to.print) == 1 && to.print == "short") {
    cat("\nIdentity  measures for keys", x$keys,
        "\nand attribute measures for",x$target,"from the same keys\n" )
    short <- rbind(c(x$ident[1,1],x$attrib[1,1]),
                   cbind(x$ident[,4],x$attrib[,5]))
    dimnames(short) <- list(c("Original",paste("Synthesis", 1:dim(x$attrib)[1])),c("Identity (UiO/repU)","Attrib (Dorig/DiSCO)"))
    print(round(short, digits))

  }
  if ("all" %in% to.print) {
    cat("\nOutput from the following call to function disclosure()")
    print(x$call)
  }
    
    if (any(c("ident","all") %in% to.print))  { 
      cat("\nIdentity disclosure measures for", dim(x$ident)[[1]] ,"synthetic data set(s) from keys:\n", x$keys,"\n\n")
      print(round(x$ident, digits))
      }
    
    if (any(c("attrib","all") %in% to.print)) {
      cat("\nAttribute disclosure measures for",x$target,"from keys:",x$keys,"\n")
       print(round(x$attrib, digits))
    }
if (any(x$Nidentexclude >0)) {
  cat("\nNumber of records excluded from identity disclosure for missing keys\n")
  print(x$Nidentexclude)  
}
###---------------- print CAP measures-----------------------------------------
  if (any(c("allCAPs","all") %in% to.print)) {
    cat("\nCAP measures for the target", x$target, "with keys\n")
    cat(x$keys,"\n\n")
    print(round(x$allCAPs, digits))
  }   
  
  ###------------------------ print check messages  
  if (x$check1 != "" & !("check_1way" %in% to.print)){
    cat("\nThe 1 way distributions of",x$target,"has a large contribution to disclosure from level",x$check1,"\n")
    cat("Please add 'check_1way' to to.print (e.g. print(disclosure_result, to.print = 'check_1way'),\n")
    cat("and look at original data to decide if backround knowledge would make this disclosure likely\n")
    cat("Consider excluding this level with the not.targetlev parameter to the disclosure function.\n\n")
  }
       
  if ("check_1way" %in% to.print){
      if (!is.null(x$check_1way) ) {
      cat("\nDetails of target level contributing disproportionately to disclosure\n")
      print(x$check_1way)
      }
      else {cat("No 1 way checks for your target are flagged at your settings for thresh_1way\n\n ")}
  }
  
  if (!(length(x$check2) == 1 && x$check2 == "") & !("check_2way" %in% to.print)){
    cat("\nDetails of target-key pairs contributing disproportionately to disclosure of",x$target,"\n")
    if (length(x$check2) > 3) cat("Note only the first 3 printed here\n")
    for (i in 1:(min(length(x$check2),3))) cat(x$check2[i],"\n")
    cat("\nPlease examine component $check_2way of the disclosure object and look at original data.\n")
    cat("Consider excluding these key-target pairs with some the following parameters to disclosure:\n")
    cat("exclude_ov_denom_lim = TRUE or defining key-target combinations from exclude.targetlevs,\n")
    cat("exclude.keys and exclude.keylevs\n\n" )
  }
  
  if ("check_2way" %in% to.print){
      cat("\nDetails of target-key combinations contributing disproportionately to disclosure\n")
      if (!is.null(x$check_2way)) {
        cat("This is a list with one for each of",dim(x$check_2way),"syntheses\n")
        print(x$check_2way)
       }
      else {cat("No 2 way checks of key-target combinations are flagged at your settings for thresh_2way\n\n ")}
    }
  ###----------------------  print exclusions -----------------
  if ("exclusions" %in% to.print){
    for (i in 1:length(x$Nexclusions)){
      if (sum(x$Nexclusions[[i]]) > 0 ) {
        cat("\nRecords excluded from attribute exclusion measures for synthesis",i,"\n") 
        print(x$Nexclusions[[i]])}
      else cat("\nNo records excluded from attribute exclusion measures for synthesis",i,"\n")
    }
  }

  
  invisible(x)
}
Any scripts or data that you put into this service are public.
synthpop documentation built on June 8, 2025, 1:31 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
synthpop
Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

R/disclosure.R
In synthpop: Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

Defines functions print.disclosure disclosure.synds disclosure.list disclosure.default disclosure

Documented in disclosure disclosure.list disclosure.synds print.disclosure

Try the synthpop package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

synthpop Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

R/disclosure.R In synthpop: Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

Defines functions print.disclosure disclosure.synds disclosure.list disclosure.default disclosure

Documented in disclosure disclosure.list disclosure.synds print.disclosure

Try the synthpop package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

synthpop
Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control

R/disclosure.R
In synthpop: Generating Synthetic Versions of Sensitive Microdata for Statistical Disclosure Control