#' Partition Data
#'
#' @param dat Original data
#' @param QI Quasi-Identifiers
#' @param TA Target Attribute
#' @param ...
#' @return Partitioning Data by Quasi-Identifiers and Target Attribute
#' @examples
#' data(AdultUCI, package='arules')
#' # TA (Target)
#' TA <- c('income')
#' # QI (Quasi-Identifiers)
#' QI <- c('age', 'workclass', 'education', 'marital.status', 'occupation', 'race', 'sex', 'native.country')
#' #QI <- c('age', 'workclass', 'education', 'marital.status')
#' # SA (Sensitive Attributes)
#' SA <- c('capital.gain', 'capital.loss')
#' # IS (Insensitive)
#' IS <- c('relationship', 'hours.per.week', 'education.num')
#' # Remove
#' RM <- c('fnlwgt')
#'
#' parda(AdultUCI, QI, TA)
parda <- function(dat, QI, TA, ...){
# Description : Partition Data (only QI, TA)
#
# Arguments
# dat : Original data
# QI : Quasi-identifiers
# TA : Target Attribute
# Partition data
data.QI <- subset(dat, select = c(QI, TA))
class(data.QI) <- c('parda', 'data.frame')
# Attributes
attr(data.QI, 'QI') <- QI # Quasi-Identifiers
attr(data.QI, 'TA') <- TA # Target Attribute
attr(data.QI, 'nQI') <- length(QI) # number of QI
attr(data.QI, 'nobs') <- nrow(data.QI) # number of obs.
return(data.QI)
}
#' Select a CP values
#'
#' @param n number of cp intervals
#' @param len number of samples
#' @param m number of cp sample matrix rows
#' @param ...
#' @return Selected CP values
cpselec <- function(n = 10, len = 100, m = 2, ...){
# Description : Select cp
#
# Arguments
# n : # of cp intervals
# len : # of samples
# m : # of cp sample matrix rows
# cp값을 랜덤하게 뽑는 과정
cp.vec <- unlist(lapply(seq_len(n), function(x){
# uniform 분포를 이용하여 0부터 10^-x까지의 구간을 len개 생성
# 구간은 n개가 생성이 됨
rx <- runif(n = len, min = 0, max = 10^-x)
# 각 구간마다 1개를 랜덤하게 추출
sample(rx, size = 1)
}))
# 각 구간별 (n개의 구간) (으)로 1개씩 생성해서 n개의 cp값을 리턴
# 위에서 뽑은 cp값에 대해서 m개의 cp값을 임의로 추출
cp.val <- sample(cp.vec, size = m)
class(cp.val) <- class('cpselec')
# Attributes
attr(cp.val, 'ncp') <- length(cp.val)
return(cp.val)
}
#' Classing QI by \code{rpart} function
#'
#' @param dat \code{parda} object
#' @param cp Cost Complexity Parameter. \code{cpselec} object
#' @param loss.mat Loss matrix
#' @param cval number of cross-validations
#' @param mc if TRUE \code{mclapply} applied, FALSE \code{lapply} applied (default).
#' @param ...
#' @return Classed a Quasi-Identifiers by CP using Entropy
rpaclass <- function(dat, cp = 0.01, loss.mat = matrix(c(0,1,2,0), ncol=2), cval=10, mc = FALSE, ...){
# Description : QI classing by cp
#
# Arguments
# dat : parda Data Object
# cp : cp value (matrix) - results of `cpselec`
# loss.mat : loss matrix
# cval : # of cross-validation
# mc : if TRUE (use mclapply), FALSE (default, use lapply)
# 필요 패키지 불러오기
require(rpart)
# Recursive Partitioning and Regression Trees
require(pbapply)
# Adding Progress Bar to '*apply' Functions
# mclapply.hack.R sourcing
source('http://www.stat.cmu.edu/~nmv/setup/mclapply.hack.R')
# `parda`에서 return된 object의 attr을 가져옴
ta <- attr(dat, 'TA') # TA
qi <- attr(dat, 'QI') # QI
# lapply (mc = FALSE)
if(mc == FALSE){
# rpart result by cp by QI
# cp값 마다 function 수행
cp.res <- pblapply(cp, function(x){
# qi 마다 function 수행
pblapply(qi,
function(var){
# Formula
Fmla <- as.formula(paste(ta, '~', var))
# factor level에 따라서 rpart를 통해 classing을 할 것인지 말 것인지 판단
# QI factor level >= 5
if(nlevels(as.factor(dat[, var])) >= 5){
# rpart 함수 적용하여 QI classing 수행
rpart(Fmla,
data = dat,
method = 'class',
parms = list(loss = loss.mat, split = 'information'),
control = rpart.control(cp = x, xval = cval))
}
else {'None'}
# QI factor level < 5
# 이 때는 'None'이라고 결과값을 리턴
})
})
}
# mclapply (mc = TRUE)
if(mc == TRUE){
# rpart result by cp by QI
# cp값 마다 function 수행
cp.res <- mclapply(cp, function(x){
# qi 마다 function 수행
mclapply(qi,
function(var){
# Formula
Fmla <- as.formula(paste(ta, '~', var))
# factor level에 따라서 rpart를 통해 classing을 할 것인지 말 것인지 판단
# QI factor level >= 5
if(nlevels(as.factor(dat[, var])) >= 5){
# rpart 함수 적용하여 QI classing 수행
rpart(Fmla,
data = dat,
method = 'class',
parms = list(loss = loss.mat, split = 'information'),
control = rpart.control(cp = x, xval = cval))
}
else {'None'}
# QI factor level < 5
# 이 때는 'None'이라고 결과값을 리턴
})
})
}
class(cp.res) <- c('rpaclass', 'list')
# Attributes 추가
attr(cp.res, 'Call') <- match.call()
attr(cp.res, 'n') <- attr(dat, 'nobs') # nobs.
attr(cp.res, 'cp') <- cp # cp values
attr(cp.res, 'QI') <- qi # QI
attr(cp.res, 'nQI') <- attr(dat, 'nQI') # QI 갯수
attr(cp.res, 'ncp') <- attr(cp, 'ncp') # cp값 갯수
return(cp.res)
}
#' Create a Lattice Node
#'
#' @param obj \code{rpaclass} object
#' @param mc if TRUE \code{mclapply} applied, FALSE \code{lapply} applied (default).
#' @param ...
#' @return Created Lattice Node using result of QI Classing
lanode <- function(obj, mc = FALSE, ...){
# Description : Lattice Node
#
# Arguments
# obj : cpctrl object
# mc : if TRUE (use mclapply), FALSE (default, use lapply)
require(pbapply)
# Adding Progress Bar to '*apply' Functions
# mclapply.hack.R sourcing
source('http://www.stat.cmu.edu/~nmv/setup/mclapply.hack.R')
qi <- attr(obj, 'QI')
cp <- attr(obj, 'cp')
n.qi <- attr(obj, 'nQI')
n.cp <- attr(obj, 'ncp')
# cp값에 따른 QI별로 가능한 조합 수
# cp값 개수 ^ QI 개수
lattice.pairs <- expand.grid(rep(list(seq_len(n.cp)), n.qi))
# expand.grid : Create a Data Frame from All Combinations of Factors
lattice.node <- list()
# 비어있는 list를 만듦
length(lattice.node) <- nrow(lattice.pairs)
# list의 length를 lattice.pairs의 nrow로 지정
# lapply (mc = FALSE)
if(mc == FALSE){
lattice.node <- pblapply(seq_len(nrow(lattice.pairs)), function(lat){
pblapply(seq_len(ncol(lattice.pairs)), function(no){
ind <- lattice.pairs[lat, no]
obj[[ind]][[no]]})
})
}
# mclapply (mc = TRUE)
if(mc == TRUE){
lattice.node <- mclapply(seq_len(nrow(lattice.pairs)), function(lat){
mclapply(seq_len(ncol(lattice.pairs)), function(no){
ind <- lattice.pairs[lat, no]
obj[[ind]][[no]]
}, num = mcs)
})
}
# class 지정
class(lattice.node) <- c('lanode', 'list')
attr(lattice.node, 'lattice_pairs') <- lattice.pairs
# lattice node의 generalization hierarchy
# Attributes 추가
attr(lattice.node, 'QI') <- qi # QI
attr(lattice.node, 'cp') <- cp # cp values
attr(lattice.node, 'nQI') <- attr(obj, 'nQI') # QI 갯수
attr(lattice.node, 'ncp') <- attr(obj, 'ncp') # cp 값의 갯수
attr(lattice.node, 'nnode') <- length(lattice.node) # lattice node의 갯수
return(lattice.node)
}
#' Quasi-Identifiers Grouping
#'
#' @param dat \code{parda} object
#' @param obj \code{lanode} object
#' @param ...
#' @return Grouped Data by each lattice node
qigrp <- function(dat, obj, mcs = 4, ...){
# Description : QI grouping!
#
# Arguments
# dat : (QI, TA) subest data
# obj : lanode object
require(mgcv) ## uniquecombs() : find the unique rows in a matrix
# mclapply.hack.R sourcing
source('http://www.stat.cmu.edu/~nmv/setup/mclapply.hack.R')
# Data to be grouped
data.QI.GH <- dat
# QI Class 확인
# Numeric : TRUE, Factor : FALSE
qi.class <- unlist(lapply(names(data.QI.GH[, -ncol(data.QI.GH)]),
function(x) is.numeric(data.QI.GH[, x])))
#### QI grouping
data.QI.GH.res <- mclapply(1:length(obj), function(x){
for(i in 1:length(qi.class)){
# CASE I : Numeric QI
if(qi.class[i] == T){
# rpart result : EXIST
if(class(obj[[x]][[i]]) == 'rpart' && class(obj[[x]][[i]]$splits) == 'matrix'){
split.point <- obj[[x]][[i]]$splits[, 4]
# rpart object의 splits value를 통해 classing되는 구간을 찾음
data.QI.GH[, i] <- cut(data.QI.GH[, i], c(0, split.point, Inf))
# cut() function을 통해 위에 split.point 결과를 가지고 grouping
}
# rpart result : NOT ("None", "root[1]")
# 특히, `root[1]`로 나타나는 경우에는 TA에 대한 설명력이 없다는 의미임
else{
data.QI.GH[, i] <- as.numeric(data.QI.GH[, i])
}
cat('\n Numeric QI generalization >> \n')
}
# CASE II : Factor QI
if(qi.class[i] != T){
data.QI.GH[, i] <- as.character(data.QI.GH[, i])
# rpart result : EXIST
if(class(obj[[x]][[i]]) == 'rpart' && class(obj[[x]][[i]]$csplit) == 'matrix'){
split.point <- t(obj[[x]][[i]]$csplit)
# terminal node로 향하는 split 방향에 대한 matrix
grp.mat <- uniquecombs(split.point) # terminal node
grp.ind <- attr(grp.mat, 'index') # terminal node index
grp.ind.nlevels <- nrow(grp.mat) # terminal node level 갯수
grp.ind.levels <- levels(as.factor(grp.ind)) # terminal node factor level
# grouping levels
dat.levels <- levels(as.factor(data.QI.GH[, i]))
# 위 결과에 의해 factor level의 QI를 grouping하는 과정
for(j in 1:grp.ind.nlevels){
data.QI.GH[data.QI.GH[, i] %in% dat.levels[grp.ind == j], i] <- j
}
data.QI.GH[, i] <- as.factor(data.QI.GH[, i])
}
# rpart result : NOT ("None", "root[1]")
else{
data.QI.GH[, i] <- as.factor(data.QI.GH[, i])
}
cat('\n Factor QI generalization >> \n')
}
cat(paste('\n\n\n QI - ', i, ' : Completed >> \n\n\n'))
}
data.QI.GH
})
# class 지정
class(data.QI.GH.res) <- c('qigrp', 'list')
# attributes 추가
attr(data.QI.GH.res, 'QI') <- attr(obj, 'QI') # QI
attr(data.QI.GH.res, 'cp') <- attr(obj, 'cp') # cp values
attr(data.QI.GH.res, 'nnode') <- attr(obj, 'nnode') # 노드의 갯수
return(data.QI.GH.res)
}
#' Equivalence Class
#'
#' @param dat \code{qigrp} object
#' @param QI Quasi-Identifiers
#' @param TA Target Attribute
#' @param ...
#' @return Equivalence Class and Size
eclass <- function(dat, QI, TA, ...){
# Description : Equivalence Class and Size
#
# Arguments
# dat : QIgrouping Data Object
# QI : QI
# TA : TA
# 필요 패키지 불러오기
require(reshape2)
require(pbapply)
equi.res <- pblapply(1:length(dat), function(x){
data.QI.sort <- dat[[x]][order(dat[[x]][, QI]), ]
# QI grouping 결과 데이터를 QI마다 ordering을 시킴
# 즉, equivalence class끼리 볼 수 있도록 하는 것임
rownames(data.QI.sort) <- NULL
# equivalence class size 산출
equi.class <- melt(data.QI.sort,
id.vars = QI,
measure.vars = TA)
equi.class <- dcast(equi.class, ... ~ variable, length)
colnames(equi.class)[ncol(equi.class)] <- 'equi.size'
attr(equi.class, 'NequiClass') <- nrow(equi.class)
attr(equi.class, 'equi.size') <- equi.class$equi.size
equi.class
})
# class 지정
class(equi.res) <- c('eclass', 'list')
# Attributes 추가
attr(equi.res, 'n') <- length(equi.res) # equivalence class의 갯수
return(equi.res)
}
#' Suppression and Information Loss Metrics
#'
#' @param dat \code{eclass} object
#' @param kanon k of k-Anonymity
#' @param maxsup threshold of k
#' @param ...
#' @return Suppression and Information Loss Metrics. Information Loss Metrics are Discernbility Metric (DM) and Entropy
resume <- function(dat, kanon = 3, maxsup = 0.01, ...){
# Description : Suppression & Information Loss Computation
#
# Arguments
# dat : equiClass Data Object
# kanon : k of k-Anonymity (default: 3)
# maxsup : default (0.01), 0.05 or 0.1
# 필요 패키지 불러오기
require(pbapply)
require(plyr)
# Suppression
supp.res <- unlist(pblapply(1:length(dat), function(x){
supp <- sum(dat[[x]]$equi.size < kanon) / sum(dat[[x]]$equi.size)
supp
}))
# Information Loss
loss.res <- pblapply(1:length(dat), function(x){
# Discernibility Metric
DM <- sum(dat[[x]]$equi.size[which(dat[[x]]$equi.size >= 3)]^2) +
sum(sum(dat[[x]]$equi.size) * dat[[x]]$equi.size[which(dat[[x]]$equi.size < 3)])
# Entropy
ENTROPY <- -sum(dat[[x]]$equi.size * log2(dat[[x]]$equi.size / sum(dat[[x]]$equi.size)))
c(DM, ENTROPY)
})
loss.res <- ldply(loss.res) # list to data.frame
names(loss.res) <- c('DM', 'ENTROPY') # column names
#
resume.res <- data.frame(supp = supp.res,
supp_rank = factor(rank(supp.res)),
optimal = ifelse(supp.res < maxsup, '*', ''),
DM = loss.res$DM,
DM_rank = factor(rank(loss.res$DM)),
Entropy = loss.res$ENTROPY,
Ent_rank = factor(rank(loss.res$ENTROPY)))
# Class 지정
class(resume.res) <- c('resume', 'data.frame')
# Attributes 추가
attr(resume.res, 'kanon') <- which(resume.res$optimal == '*') # k-anonymous를 만족하는 노드의 row number
attr(resume.res, 'n_supprank') <- unique(resume.res$supp_rank) # suppression rank의 unique()
attr(resume.res, 'n_dmrank') <- unique(resume.res$DM_rank) # DM rank의 unique()
attr(resume.res, 'n_entrank') <- unique(resume.res$Ent_rank) # Entropy rank의 unique()
return(resume.res)
}
#' Plot
#'
#' @param obj \code{resume} object
#' @param ...
#' @return Plot
plot.resume <- function(obj, ...){
require(ggplot2)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
# suppression
s <- ggplot(data = obj, aes(x = supp_rank, y = supp)) + geom_line(aes(group=1)) + geom_point()
# DM
d <- ggplot(data = obj, aes(x = DM_rank, y = DM)) + geom_line(aes(group=1)) + geom_point()
# Entropy
e <- ggplot(data = obj, aes(x = Ent_rank, y = Entropy)) + geom_line(aes(group=1)) + geom_point()
multiplot(s, d, e)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.