sdcTable: Methods for Statistical Disclosure Control in Tabular Data

#' @aliases calc.multiple,character,list-method
#' @rdname calc.multiple-method
setMethod(f='calc.multiple', signature=c('character', 'list'),
  definition=function(type, input) {
    .SD <- ID <- NULL
    if (!type %in% c('makePartitions', 'genMatMFull',
        'makeAttackerProblem', 'calcFullProblem') ) {
      stop("calc.multiple:: argument 'type' is not valid!\n")
    }

    if ( type == 'makePartitions' ) {
      return(c_make_partitions(input))
    }
    if ( type == 'genMatMFull' ) {
      return(c_gen_mat_m(input))
    }
    if ( type == 'makeAttackerProblem' ) {
      return(c_make_att_prob(input))
    }
    if ( type == 'calcFullProblem' ) {
      return(c_calc_full_prob(input))
    }
  }
)

setMethod("c_make_partitions", signature=c("list"), definition=function(input) {
  pI <- input$objectA
  dimInfoObj <- input$objectB
  dimInfo <- g_dim_info(dimInfoObj)
  strIDs <- g_strID(pI)

  ## create classes and groups
  tmpDat <- expand.grid(lapply(1:length(dimInfo), function(x) { 1:g_nr_levels(dimInfo[[x]]) } ))
  groups <- apply(tmpDat, 1, function(x) { paste(x, collapse="-")})
  classes <- apply(tmpDat, 1, sum)
  sortOrder <- order(classes)
  classes <- classes[sortOrder]
  classesUnique <- unique(classes)
  groups <- groups[sortOrder]
  splitGroups <- split(groups, classes)

  ## create tables for all classes and groups
  final <- list()
  final$groups <- as.list(groups)
  final$indices <- list()

  # default_codes and levels
  default_codes <- lapply(1:length(dimInfo), function(x) {
    g_default_codes(dimInfo[[x]])
  })
  dim_levels <- lapply(1:length(dimInfo), function(x) {
    g_levels(dimInfo[[x]])
  })

  # data.table to merge on
  df <- data.table(N=1:length(strIDs), strIDs=strIDs)
  setkey(df, strIDs)

  for (i in 1:length(groups)) {
    final$indices[[i]] <- list()
    levs <- as.integer(unlist(sapply(groups[[i]], strsplit, "-")))

    res <- list()
    for (z in 1:length(dimInfo)) {
      res[[z]] <- list()
      index <- which(g_levels(dimInfo[[z]]) %in% c(levs[z], levs[z]-1))
      codesDefault <- default_codes[[z]][index]
      if (levs[z] == 1) {
        res[[z]] <- codesDefault
      } else {
        levOrig <- dim_levels[[z]][index]
        diffs <- c(0,diff(levOrig))
        checkInd <- which(diffs == 1)-1
        out <- data.frame(index=index, levOrig=levOrig, codesDefault=codesDefault, ind=NA)
        out$ind[checkInd] <- 1

        checkInd <- c(checkInd, length(index))
        splitVec <- rep(0, length(index))
        for ( j in 2:length(checkInd) ) {
          if ( j < length(checkInd) ) {
            splitVec[checkInd[j-1]:(checkInd[j]-1)] <- j-1
          } else {
            splitVec[checkInd[j-1]:(checkInd[j])] <- j-1
          }
        }
        spl <- split(index, splitVec)
        counter <- 1
        for (k in 1:length(spl)) {
          rowInd <- match(spl[[k]], out$index)
          tmp <- out[rowInd,]
          if ( any(tmp[,"levOrig"]==levs[z]) ) {
            tmp <- tmp[1:(max(which(tmp$levOrig==levs[z]))),]
            res[[z]][[length(res[[z]])+1]] <- sort(unique(as.character(tmp$codesDefault)))
          }
        }
      }
    }
    final$indices[[i]] <- list()
    combs <- expand.grid(lapply(1:length(res), function(x) {
      1:length(res[[x]])
    }))

    final$indices[[i]] <- list();
    length(final$indices[[i]]) <- nrow(combs)
    for (m in 1:nrow(combs)) {
      final.strIDs <- pasteStrVec(expand(lapply(1:ncol(combs), function(x) {
        res[[x]][[combs[m,x]]]
      })), ncol(combs))
      df2 <- data.table(strIDs=final.strIDs, key="strIDs")
      final$indices[[i]][[m]] <- df[df2]$N
    }
  }
  final$nrGroups <- length(groups)
  final$nrTables <- sum(sapply(1:final$nrGroups, function(x) {
    length(final$indices[[x]])
  }))
  return(final)
})

setMethod("c_gen_mat_m", signature=c("list"), definition=function(input) {
  x <- input$objectA
  y <- input$objectB

  levelObj <- g_dim_info(y)
  strID <- g_strID(x)
  nrVars <- length(levelObj)
  nrCells <- g_nrVars(x)
  freqs <- g_freq(x)

  constraintM <- init.simpleTriplet(type='simpleTriplet', input=list(mat=matrix(0, nrow=0, ncol=nrCells)))
  for ( i in 1:nrVars ) {
    lO <- levelObj[[i]]
    keepList <- lapply(g_str_info(y)[-i], function(k) {
      seq(k[1], k[2])
    })
    keepList2 <- lapply(g_str_info(y)[i], function(k) {
      seq(k[1], k[2])
    })
    f1 <- f2 <- mySplitIndicesList(strID, keepList2)

    if ( nrVars > 1 ) {
      f1 <- mySplitIndicesList(strID, keepList)
    }

    dimlO <- g_dims(lO)
    if ( length(unique(f2)) != 1 ) {
      dimInd <- sapply(1:length(dimlO), function(x) { identical( sort(unique(f2)), dimlO[[x]]) } )
      if ( sum(dimInd) == 0 ) {
        for ( j in 1:length(g_dims(lO)) ) {
          splitInd <- which(f2 %in% g_dims(lO)[[j]])
          spl <- split(splitInd, f1[splitInd])
          for ( z in 1:length(spl) ) {
            ind <- rep(1,length(spl[[z]]))
            ind[which.max(freqs[spl[[z]]])] <- -1
            if ( !is.zero(sum(freqs[spl[[z]]]*ind)) ) {
              stop("something went wrong!\n")
            }
            constraintM <- c_add_row(constraintM, input=list(index=spl[[z]], values=ind))
          }
        }
      } else {
        splitInd <- which(f2 %in% g_dims(lO)[[which(dimInd==TRUE)]])
        ## only 1 dimension
        if ( nrVars > 1 ) {
          spl <- split(splitInd, f1[splitInd])
        } else {
          spl <- split(splitInd, rep(1, length(splitInd)))
        }

        for ( z in 1:length(spl) ) {
          ind <- rep(1,length(spl[[z]]))
          ind[which.max(freqs[spl[[z]]])] <- -1
          if ( !is.zero(sum(freqs[spl[[z]]]*ind)) ) {
            stop("something went wrong! (z=",z," und names(spl)[z]='",names(spl)[z],")\n")
          }
          constraintM <- c_add_row(constraintM, input=list(index=spl[[z]], value=ind))
        }
      }
    }
  }
  return(constraintM)
})

setMethod("c_make_att_prob", signature=c("list"), definition=function(input) {
  x <- input$objectA
  y <- input$objectB
  nrVars <- g_nrVars(x)
  A <- c_gen_mat_m(input=list(objectA=x, objectB=y))

  ## calculating (logical) constraints for the master problem ##
  # idea: for each constraint at least 2 suppressions must
  # exist if one xi != 0! (http://www.eia.doe.gov/ices2/missing_papers.pdf)
  newCutsMaster <- init.cutList(type='empty', input=list(nrCols=nrVars))
  #xx <- lapply(1:g_nr_rows(A), function(x) {
  # cols <- g_col_ind(g_row(A, input=list(x)))
  # v <- rep(0, nrVars)
  # v[cols] <- c(1, rep(-1, length(cols)))
  # s_add_complete_constraint(newCutsMaster) <<- list(init.cutList(type='singleCut', input=list(vals=v, dir="<=", rhs=0)))
  #})
  ################################################################

  nrConstraints <- g_nr_rows(A)
  objective <- rep(0, length=2*nrVars+nrConstraints)
  z1 <- init.simpleTriplet(type='simpleTripletDiag', input=list(nrRows=nrVars, negative=FALSE))
  z2 <- init.simpleTriplet(type='simpleTripletDiag', input=list(nrRows=nrVars, negative=TRUE))
  z <- c_bind(object=z1, input=list(z2, bindRow=FALSE))
  A <- c_bind(object=z, input=list(g_transpose(A), bindRow=FALSE))
  direction <- rep("==", g_nr_rows(A))
  rhs <- rep(0, g_nr_rows(A))

  types <- rep("C", g_nr_cols(A))
  boundsLower <- list(ind=1:g_nr_cols(A), val=c(rep(0, 2*nrVars), rep(-Inf, nrConstraints)))
  boundsUpper <- list(ind=1:g_nr_cols(A), val=c(rep(Inf, 2*nrVars), rep(Inf,  nrConstraints)))

  aProb <- new("linProb",
    objective=objective,
    constraints=A,
    direction=direction,
    rhs=rhs,
    boundsLower=boundsLower,
    boundsUpper=boundsUpper,
    types=types)
  return(list(aProb=aProb, newCutsMaster=newCutsMaster))
})

setMethod("c_calc_full_prob", signature=c("list"), definition=function(input) {
  .SD <- ID <- id <- NULL
  x <- input$objectA
  y <- input$objectB
  time.start <- proc.time()
  datO <- g_raw_data(x)
  dimObj <- g_dim_info(y)

  # we have to aggregate if we are dealing with microdata
  if (g_is_microdata(x)) {
    rawData <- datO[, lapply(.SD, sum, na.rm=TRUE), by=key(datO), .SDcols=setdiff(colnames(datO), key(datO))]
  } else {
    rawData <- copy(datO)
  }
  ind.dimvars <- g_dimvar_ind(x)
  ind.freq <- g_freqvar_ind(x)

  codes <- list(); length(codes) <- length(ind.dimvars)
  for (i in 1:length(codes)) {
    codes[[i]] <- rawData[[ind.dimvars[i]]]
    cDefault <- g_default_codes(dimObj[[i]])
    cOriginal <- g_original_codes(dimObj[[i]])
    cOriginalDups <- g_dups(dimObj[[i]])
    cOriginalDupsUp <- g_dups_up(dimObj[[i]])
    if (all(unique(codes[[i]]) %in% c(cOriginal, cOriginalDups))) {
      codes[[i]] <- c_match_default_codes(object=dimObj[[i]], input=rawData[[ind.dimvars[i]]])
      if (sum(is.na(codes[[i]]))>0) {
        stop(paste0("NA values in default codes have been generated for variable ",
          shQuote(names(dimObj)[i]),".\nPlease check the definition of this hierarchy!\n"))
      }
    } else if (all(unique(codes[[i]]) %in% cDefault)) {
      # cat("no recoding necessary!\n")
    } else {
      stop("c_calc_full_prob:: recoding not possible. Check your inputs!\n")
    }
  }

  ## calculate all possible combinations within the lowest levels of dim-vars
  ## if any combinations are missing (missing.codes), we have to set them to 0 later
  strID <- as.character(pasteStrVec(unlist(codes), length(codes)))
  exDims <- pasteStrVec(unlist(codes), length(codes))
  possDims <- sort(pasteStrVec(as.character(expand(lapply(dimObj, function(x) {
    g_minimal_default_codes(x)
  }), vector=TRUE)), length(dimObj)))
  missing.codes <- setdiff(possDims, exDims)

  ## fill the table
  nrIndexvars <- length(ind.dimvars)
  fullDims <- lapply(dimObj, g_dims)

  allCodes <- expand(lapply(dimObj, g_default_codes), vector=FALSE)
  fullTabObj <- data.table(ID=1:length(allCodes[[1]]))
  for (i in 1:length(allCodes)) {
    fullTabObj[,colnames(rawData)[ind.dimvars][i]:=allCodes[[i]]]
  }
  setkeyv(fullTabObj, colnames(rawData)[ind.dimvars])
  fullTabObj[,ID:=NULL]

  ## revert rawData codes to default codes
  for (j in seq_along(ind.dimvars)) {
    v <- c_match_default_codes(object=dimObj[[j]], input=rawData[,get(names(dimObj)[j])])
    set(rawData, NULL, names(dimObj)[j], v)
  }
  setkeyv(rawData, colnames(rawData)[ind.dimvars])

  ## replace NAs in rawData by 0 (required for aggregation)
  cols <- colnames(rawData)[(length(dimObj)+1):ncol(rawData)]
  ind.na <- list(); length(ind.na) <- length(cols); k <- 1
  for (j in cols) {
    ind.na[[k]] <- which(is.na(rawData[[j]]))
    set(rawData, ind.na[[k]], j, 0)
    k <- k+1
  }; rm(k)

  ## merge minDat to fullDat
  fullTabObj <- merge(fullTabObj, rawData, all.x=TRUE)

  ## set missing combinations of lowest levels to 0
  ## problematic are all levels that should exist, but do not exist
  ## they are filled with 0 so that we can aggregate
  dim.vars <- colnames(fullTabObj)[ind.dimvars]
  # performance improvement
  cmd <- paste0("fullTabObj[,strID:=paste0(",dim.vars[1])
  if (length(dim.vars)>1) {
    for (i in 2:length(dim.vars)) {
      cmd <- paste0(cmd, ", ",dim.vars[i])
    }
  }
  cmd <- paste0(cmd,")]")
  eval(parse(text=cmd))
  strID <- fullTabObj$strID
  fullTabObj[,strID:=NULL]

  if (length(missing.codes) > 0) {
    index <- which(strID%in%missing.codes)
    for (i in 1:length(cols)) {
      set(fullTabObj, index, cols[i], 0)
    }
  }

  ## fill up missing dimensions
  not.finished <- TRUE

  # which indexvars have any hierarchy (not just the total?)
  # these indiecs specify the dim-variables we loop over
  useInds <- which(sapply(y@dimInfo, function(x) {
    length(x@codesOriginal)>1
  }))

  fullTabObj[,id:=.I]
  cols <- (nrIndexvars+1):(ncol(fullTabObj)-1)
  col.names <- names(fullTabObj)[cols]
  while (not.finished) {
    for (i in useInds) {
      if (length(dim.vars) > 1) {
        setkeyv(fullTabObj, dim.vars[-i])
      } else {
        setkeyv(fullTabObj, dim.vars[1])
      }

      cur.dim <- dimObj[[i]]@dims
      for (j in length(cur.dim):1) {
        cur.levs <-  cur.dim[[j]]
        cmd <- paste0("out <- fullTabObj[",dim.vars[i],"%in% cur.levs[-1],]")
        eval(parse(text=cmd))
        if (length(dim.vars)==1) {
          out <- out[,lapply(.SD,sum), .SDcols=col.names]
        } else {
          out <- out[,lapply(.SD,sum), .SDcols=col.names, by=key(out)]
        }
        cmd <- paste0("row.ind <- fullTabObj[",dim.vars[i],"==cur.levs[1],id]")
        eval(parse(text=cmd))
        for (z in col.names) {
          cmd <- paste0("fullTabObj[id %in% row.ind,",z,":=out[[z]]]")
          eval(parse(text=cmd))
        }
      }
    }
    if (!is.na(fullTabObj[1,ind.freq,with=FALSE])) {
      not.finished <- FALSE
    } else {
      cat("nrMissings:",sum(is.na(fullTabObj$freq)),"\n")
    }
  }
  fullTabObj[,id:=NULL]

  nrV <- nrow(fullTabObj)
  f <- fullTabObj[[ind.freq]]
  strID <- apply(fullTabObj[,dim.vars,with=FALSE],1,paste0, collapse="")

  # performance improvement
  cmd <- paste0("fullTabObj[,strID:=paste0(",dim.vars[1])
  if (length(dim.vars)>1) {
    for (i in 2:length(dim.vars)) {
      cmd <- paste0(cmd, ", ",dim.vars[i])
    }
  }
  cmd <- paste0(cmd,")]")
  eval(parse(text=cmd))
  strID <- fullTabObj$strID
  fullTabObj[,strID:=NULL]

  w <- numVarsList <- NULL
  w.ind <- g_weightvar_ind(x)
  if (!is.null(w.ind)) {
    w <- fullTabObj[[w.ind]]
  }
  n.ind <- g_numvar_ind(x)
  if (!is.null(n.ind)) {
    numVarsList <- list(); length(numVarsList) <- length(n.ind)
    for (n in 1:length(n.ind)) {
      numVarsList[[n]] <- fullTabObj[[n.ind[n]]]
    }
  }

  if (length(n.ind) > 0) {
    names(numVarsList) <- colnames(g_raw_data(x))[n.ind]
  }

  ## replace 0 in rawData by NA if they have been replaced earlier
  for (i in 1:length(ind.na)) {
    if (length(ind.na[[i]]) > 0) {
      set(rawData, ind.na[[i]], cols[i], NA)
    }
  }
  s_raw_data(x) <- list(datO)

  # w are the actual weights as used in the
  # optimization procedure; comput default bounds
  # that depend on these values
  .default_bounds <- function(cost_var) {
    nr_cells <- length(cost_var)
    ll <- list()
    ll$lb <- rep(0, nr_cells)
    ll$ub <- rep(1.5 * max(cost_var), nr_cells)
    ll$LPL <-  rep(1, nr_cells)
    ll$UPL <-  rep(1, nr_cells)
    ll$SPL <-  rep(0, nr_cells)
    ll
  }

  if (!is.null(w)) {
    cost_var <- w
  } else {
    cost_var <- f
  }
  bb <- .default_bounds(cost_var = cost_var)
  problemInstance <- new(
    "problemInstance",
    strID = strID,
    Freq = f,
    w = w,
    numVars = numVarsList,
    lb = bb$lb,
    ub = bb$ub,
    LPL = bb$LPL,
    UPL = bb$UPL,
    SPL = bb$SPL,
    sdcStatus = rep("s", nrV)
  )
  problemInstance@sdcStatus[problemInstance@Freq == 0] <- "z"
  partition <- c_make_partitions(
    input = list(
      objectA = problemInstance,
      objectB = y
    )
  )
  sdcProblem <- new("sdcProblem",
    dataObj = x,
    dimInfo = y,
    problemInstance = problemInstance,
    partition = partition,
    startI = 1,
    startJ = 1,
    indicesDealtWith = NULL,
    elapsedTime = (proc.time() - time.start)[3]
  )
  return(sdcProblem)
})
bernhard-da/sdcTable documentation built on June 10, 2019, 4:54 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
bernhard-da/sdcTable
Methods for Statistical Disclosure Control in Tabular Data

R/methods_multiple_classes.r
In bernhard-da/sdcTable: Methods for Statistical Disclosure Control in Tabular Data

R Package Documentation

Browse R Packages

We want your feedback!

bernhard-da/sdcTable Methods for Statistical Disclosure Control in Tabular Data

R/methods_multiple_classes.r In bernhard-da/sdcTable: Methods for Statistical Disclosure Control in Tabular Data

R Package Documentation

Browse R Packages

We want your feedback!

bernhard-da/sdcTable
Methods for Statistical Disclosure Control in Tabular Data

R/methods_multiple_classes.r
In bernhard-da/sdcTable: Methods for Statistical Disclosure Control in Tabular Data