Nothing
######################################################################
## CODE AND DOCUMENTATION FOR PACKAGE sdcTarget
######################################################################
#' @name sdcTarget-package
#' @docType package
#' @import methods magic foreach parallel doParallel tuple
#' @description
#' Provides functions to calculate target matrices for
#' statistical disclosure control.
#' @details
#' Parallel processing is now implemented by default
#' to speed up large searches. The number of processors
#' must be specified. If not specified, will default
#' to linear processing.
#' @references
#' \href{http://www.statslife.org.uk/events/annual-conference/conference-blog/1752-data-linkage-and-privacy}{Summary of talk at RSS 2014 Conference}
#'
NULL
#' @title
#' Hashing Definition (S4 Class)
#' @description
#' This class defines the data hash that is used to identify
#' cells to be targeted for statistical disclosure control.
#' @details
#' The hashing definition presently handles only categorical
#' fields, which limits the applicability of this software
#' accordingly.
#' @slot parts
#' The number of hashing components.
#' @slot coverage
#' The number of elements in each hashing component.
#' @slot lengths
#' The number of levels of each field to be hashed.
#' @slot na.exists
#' Whether there is an NA value in each field to be hashed.
#' @slot na.recode
#' Whether NA values should be treated as levels.
#' @slot levels
#' The number of levels of each field to be hashed.
#' @examples
#' new("sdcHashingDefinitionClass")
#' @export
setClass("sdcHashingDefinitionClass",
slots = c(parts = "integer",
coverage = "integer",
lengths = "integer",
na.exists = "logical",
na.recode = "logical",
levels = "list"),
prototype = list(parts = integer(0),
coverage = integer(0),
lengths = integer(0),
na.exists = logical(0),
na.recode = TRUE,
levels = vector(mode = "list", length = 0) ),
validity = function(object) {
identical(length(coverage), parts) &
identical(length(lengths), length(levels)) &
identical(length(lengths), length(na.exists)) &
identical(length(na.recode), length(1)) &
identical(names(levels), names(lengths)) &
identical(names(levels), names(na.exists))
} )
#' @describeIn sdcHashingDefinitionClass
#' @param .Object
#' A \code{\linkS4class{sdcHashingDefinitionClass}} object.
#' @param ...
#' The optional parameters specifying the data
#' and whether \code{NA} values are to be recoded and hashed.
#' Recoding is \code{TRUE} by default.
#' @examples
#' new("sdcHashingDefinitionClass")
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' new("sdcHashingDefinitionClass", X = my.X)
#' new("sdcHashingDefinitionClass", X = my.X, na.recode = FALSE)
#' @export
setMethod("initialize", "sdcHashingDefinitionClass",
function(.Object, ...) {
# Get any specified constructor elemnts
my.names <- names(match.call(expand.dots = TRUE))
# Get the data, if any
my.tmp <- pmatch(my.names, c("x", "data", "X"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one dataset on which the hashing is to be defined.")
my.data_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.data <- NULL
if(my.data_index)
my.data <- eval(parse(text = match.call()[my.names[my.data_index]]))
if(!is.null(my.data) && !is.data.frame(my.data))
stop("The data should be in a data frame.")
# Check if recoding is specified
my.tmp <- pmatch(my.names, c("na.recode"))
my.na_recode_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.na_recode <- TRUE
if(my.na_recode_index)
my.na_recode <- eval(parse(text = match.call()[my.names[my.na_recode_index]]))
if(!is.logical(my.na_recode) || length(my.na_recode) != 1)
stop("Recoding should be specified as a single logical value.")
.Object@na.recode <- my.na_recode
if(!is.null(my.data)) {
.Object@levels <- lapply(lapply(my.data, unique), sort)
.Object@na.exists <- sapply(my.data, function(XX) { any(is.na(XX)) })
.Object@lengths <- sapply(.Object@levels, length) + .Object@na.recode
my.tmp1 <- as.integer(rep(1, length(.Object@lengths)+1))
my.number <- 1
for(j in seq(.Object@lengths)) {
my.tmp1[j+1] <- suppressWarnings(.Object@lengths[j]*my.tmp1[j])
if(is.na(my.tmp1[j+1])) {
my.number <- my.number + 1
my.tmp1[j+1] <- .Object@lengths[j] }
}
.Object@parts <- as.integer(my.number)
my.tmp2 <- my.tmp1[-1] > shift(my.tmp1[-1], -1)
my.tmp2[length(my.tmp2)] <- TRUE
my.tmp3 <- unique((seq(my.tmp1[-1]))*my.tmp2)
.Object@coverage <- (my.tmp3 - shift(my.tmp3, 1))[-1]
}
return(invisible(.Object))
})
#' @title
#' S4 Hash Class
#' @description
#' This class defines the data hash that is used to identify
#' cells to be targeted for statistical disclosure control.
#' @details
#' The hashing definition presently handles only categorical
#' fields, which limits the applicability of this software
#' accordingly.
#' @slot .Data
#' A matrix.
#' @slot Hdef
#' The hashing definition on which the
#' \code{\linkS4class{sdcHashClass}} object is built.
#' @slot forward
#' Logical indicating whether a forward hash or a backward
#' hash, in terms of field order, has been used.
#' @slot hash
#' A character string containing one hash representation
#' for each data record.
#' @examples
#' new("sdcHashClass")
#' @export
setClass("sdcHashClass",
contains = c("matrix"),
slots = c(Hdef = "sdcHashingDefinitionClass",
forward = "logical",
hash = "character"),
prototype = prototype(Hdef = new("sdcHashingDefinitionClass"),
forward = FALSE,
hash = character(0)),
validity = function(object) {
TRUE
} )
#' @describeIn sdcHashClass
#' @details
#' Does not currently permit use of an arbitrary hashing definition.
#' @param .Object
#' A \code{\linkS4class{sdcHashingDefinitionClass}} object.
#' @param ...
#' The optional parameters specifying the data
#' and other options.
#' @examples
#' new("sdcHashClass")
#' my.X <- data.frame(matrix(ifelse(runif(5000)>.5, TRUE, FALSE), ncol = 50))
#' new("sdcHashClass", X = my.X)
#' new("sdcHashClass", X = my.X, na.recode = FALSE, which = 2:4,
#' forwardHashing = TRUE)
#' @export
setMethod("initialize", "sdcHashClass",
function(.Object, ...) {
# Get any specified constructor elemnts
my.names <- names(match.call(expand.dots = TRUE))
# Get the data, if any
my.tmp <- pmatch(my.names, c("x", "data", "X"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one dataset on which the hashing is to be defined.")
my.data_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.data <- NULL
if(my.data_index)
my.data <- eval(parse(text = match.call()[my.names[my.data_index]]))
if(!is.null(my.data) && !is.data.frame(my.data))
stop("The data should be in a data frame.")
# Check if forward hashing is specified
my.tmp <- pmatch(my.names, c("forwardHash", "forward.hash"))
my.forward_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.forward <- FALSE
if(my.forward_index)
my.forward <- eval(parse(text = match.call()[my.names[my.forward_index]]))
if(!is.logical(my.forward) || length(my.forward) != 1)
stop("Forward hashing should be specified as a single logical value.")
.Object@forward <- my.forward
if(!is.null(my.data)) {
# Identify the fields to be hashed
my.tmp <- pmatch(my.names, c("which", "fields", "variables"))
my.fields_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
# Work with fields by name
my.fields <- names(my.data)
if(my.fields_index)
my.fields <- eval(parse(text = match.call()[my.names[my.fields_index]]))
if(is.numeric(my.fields) && length(my.fields) < ncol(my.data))
my.fields <- names(my.data)[my.fields]
if(!is.character(my.fields))
stop("Fields for hashing should be specified by name or index.")
if(any(my.fields %!in% names(my.data)))
stop(paste("Unidentified fields:",
paste(setdiff(my.fields, names(my.data)), collapse = " ")))
# Convert to a logical indicator of fields to be hashed
my.field_last <- 0
my.fields <- ifelse(seq(ncol(my.data)) %in% match(my.fields, names(my.data)),
TRUE, FALSE)
my.field_last <- max(which(my.fields))
# Get the hashing definition object
.Object@Hdef <- new("sdcHashingDefinitionClass", ...)
# Prepare the vectors that will accept the calculated hashes
# * calculate greatest hash bin
my.bin_max <- min(seq(.Object@Hdef@parts)[
my.field_last <= cumsum(.Object@Hdef@coverage)])
# * create a list entry for each necessary hash bin
my.list <- vector("list", my.bin_max)
my.list_names <- vector("character", my.bin_max)
i <- 0 # Hash base index
# * loop over the hash bins
for(j in seq(my.bin_max)) {
# - get relevant lengths (number of category values per variable)
my.lengths <- .Object@Hdef@lengths[(i+1):min(my.field_last,
.Object@Hdef@coverage[j]+i)]
# - get the subset of fields over which to calculate, this being
# a vector of logicals that is used as a multiplier in the hash
# calculation, so that only the specified fields are included
my.fields_subset <- my.fields[(i+1):min(my.field_last,
.Object@Hdef@coverage[j]+i)]
# - calculate the multipliers per variable
if(.Object@forward) my.multipliers <- c(1, my.lengths[-length(my.lengths)])
else my.multipliers <- c(1, rev(my.lengths[-1]))
# - calculate hash in the event that it includes more than one variable
if((my.field_last-i-1)>0) {
if(.Object@forward)
my.list[[j]] <- apply(my.data[,(i+1):min(my.field_last,
.Object@Hdef@coverage[j]+i)], 1,
function(XX, YY = cumprod(my.multipliers),
ZZ = my.fields_subset) {
sum(ZZ*as.numeric(XX)*YY) } )
else my.list[[j]] <- apply(my.data[,(i+1):min(my.field_last,
.Object@Hdef@coverage[j]+i)], 1,
function(XX, YY = rev(cumprod(my.multipliers)),
ZZ = my.fields_subset) {
sum(ZZ*as.numeric(XX)*YY) } )
} else my.list[[j]] <- my.data[, my.field_last]
# - name the hash bin
my.list_names[j] <- paste("Hash", j, sep = "")
# - increment the variable index to start the next hash bin
i <- i + .Object@Hdef@coverage[j]
}
# * concatenate the hash bins to create the overall hash
if(my.bin_max>1) {
eee <- my.list[[1]]
for(j in 2:my.bin_max) eee <- cbind(eee, my.list[[j]])
.Object@hash <- apply(eee, 1, function(XX) {
if(any(is.na(XX))) return(NA)
else return(paste(XX, collapse="X")) })
} else .Object@hash <- as.character(my.list[[1]])
# Create and label the matrix
.Object@.Data <- do.call(cbind, my.list)
if(is.null(rownames(my.data)))
rownames(.Object) <- 1:nrow(my.data)
else rownames(.Object) <- rownames(my.data)
colnames(.Object) <- my.list_names
}
return(invisible(.Object))
})
#' @title
#' Get a Hash List Subset
#' @description
#' Returns a subset of a \code{\linkS4class{sdcHashClass}} object.
#' @details
#' Does what it says on the package.
#' @param x
#' An \code{\linkS4class{sdcHashClass}} object.
#' @param i
#' An index of the hash.
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(5000)>.5, TRUE, FALSE), ncol = 50))
#' my.hc <- new("sdcHashClass", X = my.X)
#' my.hc[2:3]
#' @seealso \code{\linkS4class{sdcHashClass}},
#' @export
setMethod("[", signature = "sdcHashClass",
function(x, i) {
if(length(x@Hdef@parts) == 0)
warning("Empty sdcHashClass object. No subset taken.")
else {
if(any(i %!in% seq(x@Hdef@parts)))
stop(paste("Subset index not within 1:",
x@Hdef@parts, sep=""))
x@.Data <- x@.Data[, i, drop = FALSE]
x@hash <- apply(x@.Data, 1, paste, collapse = "X")
my.in <- rep(ifelse(seq(x@Hdef@parts) %in% i, TRUE, FALSE),
x@Hdef@coverage)
x@Hdef@levels <- x@Hdef@levels[which(my.in)]
x@Hdef@na.exists <- x@Hdef@na.exists[which(my.in)]
x@Hdef@lengths <- x@Hdef@lengths[which(my.in)]
x@Hdef@coverage <- x@Hdef@coverage[i]
x@Hdef@parts <- length(i)
}
return(invisible(x))
})
#' @describeIn show
#' @title
#' Show Method For \code{\linkS4class{sdcHashClass}} Objects
#' @description
#' Returns an abbreviated summary of a \code{\linkS4class{sdcHashClass}} object.
#' @details
#' As described in the documentation for \code{\link[methods]{show}},
#' the default for printing out S4 objects is a call to
#' a \code{\link[methods]{show}} method, whereas
#' \code{\link[base]{print}} is the default for S3 objects. This gives
#' the possibility to have an alternative way to print the contents
#' of a user-defined S4 class object.
#' @examples
#' show(new("sdcHashClass"))
#' @export
setMethod("show", signature = "sdcHashClass",
function(object) {
return(invisible(NULL))
})
#' @describeIn show
#' @description
#' The summary method runs and returns the output of
#' some calculations on an \code{\linkS4class{sdcHashClass}}
#' object, together with an abbreviated summary of the
#' object itself.
#' @param object
#' A \code{\linkS4class{sdcHashClass}} object.
#' @examples
#' summary(new("sdcHashClass"))
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' summary(my.hc)
#' @export
setMethod("summary", signature = "sdcHashClass",
function(object) {
# # Look at the inverse hash within one or more cells to check the data
# # X: A dataset (standardized or not)
# # Y: A hash list derived from the dataset X
# get5TablesByHash <- function(X, Y) {
# ccc <- Y@hash[duplicated(Y@hash)] # Get the duplicate hash values
# ddd <- sample.int(length(ccc), 5) # Identify 5 cells at random
# for(i in ddd) {
# eee <- (1:length(Y@hash))[Y@hash==ccc[i]]
# if(verbose) {
# cat(paste("Data for Hash ", Y@hash[i], ":\n", sep=""))
# print(X[eee,])
# }
# }
# }
Hdef <- object@Hdef
cellsX <- table(object@hash)
cat(paste("Number of Fields in Hash:", sum(Hdef@coverage)))
cat(paste("\nNumber of Hash Parts:", Hdef@parts))
cat(paste("\nNumber of Fields in Each Hash Part:", paste(Hdef@coverage, collapse=" ")))
cat(paste("\nNumber of Records Hashed:", length(object@hash),
"(check)", sum(cellsX),
"(check uniques plus duplicates)",
length(unique(object@hash)) + sum(duplicated(object@hash))))
cat(paste("\nNumber of Records in Cells With >=2 Records:",
sum(duplicated(object@hash) | duplicated(object@hash, fromLast=TRUE)),
"(check 1)", 2*length(unique(object@hash[duplicated(object@hash)])) +
sum(duplicated(object@hash[duplicated(object@hash)])),
"(check 2)", sum(cellsX[cellsX>1])))
cat(paste("\nNumber of Records in Cells With >=3 Records:",
3*length(unique(object@hash[triplicated(object@hash)])) +
sum(duplicated(object@hash[triplicated(object@hash)])),
"(check)", sum(cellsX[cellsX>2])))
cat(paste("\nNumber of Cells With >=2 Records:",
length(unique(object@hash[duplicated(object@hash)])),
"(check)", length(cellsX[cellsX>1])))
cat(paste("\nNumber of Cells With >=3 Records:",
length(unique(object@hash[triplicated(object@hash)])),
"(check)", length(cellsX[cellsX>2])))
cat(paste("\nNumber of Cells With One Record Alone:",
sum(1-(duplicated(object@hash) |
duplicated(object@hash, fromLast=TRUE))),
"(check)", length(cellsX[cellsX==1])))
my.retval <- table(cellsX)
cat("\nEmpirical Distribution of Cell Size:\n")
invisible(return(my.retval))
})
#' @name sdcProcessCells
#' @title
#' Identify or Remove Small or Large Cells From a Hash List
#' @description
#' Returns a hash list identifying the appropriate cells, or a
#' subset of the data provided with the specified cells removed.
#' @param Y
#' A hash list as returned by \code{\linkS4class{sdcHashClass}}.
#' @param X
#' A data frame corresponding to \code{Y}.
#' @param cutoff
#' The lower or upper cutoff of cell size.
#' @keywords big cells, small cells, identify, remove
#' @seealso \code{\linkS4class{sdcHashClass}},
NULL
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' identifySmallCells(my.hc)
#' @export
identifySmallCells <- function(Y, cutoff=1) {
cellsY <- table(Y@hash) # Note that by default NA's are excluded here
setdiff(Y@hash, Y@hash[match(names(cellsY[cellsY>cutoff]), Y@hash)])
}
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' identifyBigCells(my.hc)
#' @export
identifyBigCells <- function(Y, cutoff=1) {
cellsY <- table(Y@hash) # Note that by default NA's are excluded here
Y@hash[match(names(cellsY[cellsY>cutoff]), Y@hash)]
}
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' identifySmallCellRecords(my.hc)
#' @export
identifySmallCellRecords <- function(Y, cutoff=1)
matchAll(identifySmallCells(Y, cutoff=cutoff), Y@hash)
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' identifyBigCellRecords(my.hc)
#' @export
identifyBigCellRecords <- function(Y, cutoff=1)
matchAll(identifyBigCells(Y, cutoff=cutoff), Y@hash)
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' removeSmallCellRecords(my.X, my.hc)
#' @export
removeSmallCellRecords <- function(X, Y, cutoff=1)
X[identifyBigCellRecords(Y, cutoff=cutoff),]
#' @rdname sdcProcessCells
#' @examples
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.hc <- new("sdcHashClass", X = my.X)
#' removeBigCellRecords(my.X, my.hc)
#' @export
removeBigCellRecords <- function(X, Y, cutoff=1)
X[identifySmallCellRecords(Y, cutoff=cutoff),]
#' @title
#' S4 Target Definition
#' @description
#' The SDC target definition is used to calculate a target matrix.
#' @details
#' The hashing definition presently handles only categorical
#' fields, which limits the applicability of this software
#' accordingly.
#' @slot dim
#' The dimensions of the data on which the target definition
#' is calculated.
#' @slot dimnames
#' A list of row and column names of the data on which the
#' target definition is calculated, in the form returned
#' by \code{\link[base]{dim}}.
#' @slot cutoff
#' The minimum residual cell size.
#' @slot rownames
#' The row names of the target elements.
#' @slot size
#' The size of each target element.
#' @slot index
#' The index of each target element.
#' @slot hash
#' The target hash vector.
#' @examples
#' new("sdcTargetDefinitionClass")
#' @export
setClass("sdcTargetDefinitionClass",
slots = c(dim = "integer",
dimnames = "list",
cutoff = "integer",
rownames = "character",
size = "integer",
index = "character",
hash = "character"),
prototype = list(dim = as.integer(c(0, 0)),
dimnames = list(character(0),
character(0)),
cutoff = as.integer(1),
rownames = character(0),
size = integer(0),
index = character(0),
hash = character(0) ),
validity = function(object) {
TRUE
} )
#' @describeIn sdcTargetDefinitionClass
#' @param .Object
#' An \code{\linkS4class{sdcTargetDefinitionClass}} object.
#' @param ...
#' The optional parameters specifying the data
#' and other options.
#' Among these are
#' \tabular{ll}{
#' Name \tab Description \cr
#' X \tab A data frame with the data, not necessarily \cr
#' \tab standardised. \cr
#' Hdef \tab A hashing definition, if generated externally, \cr
#' \tab perhaps because it comes from a larger set of data. \cr
#' cutoff \tab The minimum residual cell size. \cr
#' }
#' @examples
#' set.seed(256)
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' new("sdcTargetDefinitionClass", X = my.X)
#' @seealso \code{\linkS4class{sdcTargetMatrixClass}},
#' @export
setMethod("initialize", "sdcTargetDefinitionClass",
function(.Object, ...) {
# Variables starting with X derive from the data.
# Variables starting with H relate to the hashes.
#
getStandardData <- function (X, H) {
ddd <- X
if(!all(class(X) %in% "factor"))
ddd <- data.frame(lapply(X, function(Z) { factor(Z) }))
my.special <- matchNone(ddd)
for(i in seq(ddd)) {
if(class(ddd[[i]]) %in% "factor") {
for(j in seq(H@levels[[i]]))
levels(ddd[[i]])[levels(ddd[[i]]) == H@levels[[i]][j]] <-
paste(my.special, j, sep = "")
levels(ddd[[i]]) <- substr(levels(ddd[[i]]),
start = nchar(my.special) + 1,
stop = max(nchar(levels(ddd[[i]]))))
}
if(H@na.recode)
levels(ddd[[i]])[is.na(X[[i]])] <- 0
}
ddd
}
# Get any specified constructor elements
my.names <- names(match.call(expand.dots = TRUE))
# Get the data
my.tmp <- pmatch(my.names, c("x", "data", "X"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one dataset on which the hashing is to be defined.")
my.data_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.data <- NULL
if(my.data_index)
my.data <- eval(parse(text = match.call()[my.names[my.data_index]]))
if(!is.null(my.data) && !is.data.frame(my.data))
stop("The data should be in a data frame.")
# Check if recoding is specified
my.tmp <- pmatch(my.names, c("na.recode"))
my.na_recode_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.na_recode <- TRUE
if(my.na_recode_index)
my.na_recode <- eval(parse(text = match.call()[my.names[my.na_recode_index]]))
if(!is.logical(my.na_recode) || length(my.na_recode) != 1)
stop("Recoding should be specified as a single logical value.")
# Get the hashing definition
my.tmp <- pmatch(my.names, c("hdef", "Hdef"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one hashing definition.")
my.Hdef_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.Hdef <- NULL
if(my.Hdef_index)
my.Hdef <- eval(parse(text = match.call()[my.names[my.Hdef_index]]))
else my.Hdef <- new("sdcHashingDefinitionClass",
X = my.data, na.recode = my.na_recode)
if(!is(my.Hdef, "sdcHashingDefinitionClass"))
stop("Hashing definition must be of class sdcHashDefinitionClass.")
# Check if cutoff is specified
my.tmp <- pmatch(my.names, c("cutoff"))
my.cutoff_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.cutoff <- 1
if(my.cutoff_index)
my.cutoff <- eval(parse(text = match.call()[my.names[my.cutoff_index]]))
if(!is.numeric(my.cutoff) || length(my.cutoff) != 1)
stop("Cutoff should be specified as a single integer value.")
my.cutoff <- as.integer(my.cutoff)
# Check if maximum number of cores to use is specified
my.tmp <- pmatch(my.names, c("cores", "processors", "threads"))
my.cores_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.cores <- 1
if(my.cores_index)
my.cores <- eval(parse(text = match.call()[my.names[my.cores_index]]))
if(!is.numeric(my.cores) || length(my.cores) != 1)
stop("Number of cores should be specified as a single integer value.")
my.cores <- as.integer(my.cores)
# Check if debug output is needed
my.tmp <- pmatch(my.names, c("debugging"))
my.debug_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.debug <- FALSE
if(my.debug_index)
my.debug <- eval(parse(text = match.call()[my.names[my.debug_index]]))
if(!is.logical(my.debug) || length(my.debug) != 1)
stop("Debugging output should be specified as a single logical value.")
# Get the global temporary directory (used in debugging)
my.tempdir <- tempdir()
if(my.data_index) {
# Step 1: Make a dataset with NA values set to 0 in each category.
cat("Preparing the data...")
flush.console()
my.Xstrd <- getStandardData(my.data, my.Hdef)
cat(paste(" done.", "\nCalculating statistics and",
"removing popular cell records...\n"))
flush.console()
# Step 2: Remove records in the popular cells (cutoff=1)
# * remove all records with the same actual _or_ missing values
# * does not currently permit use of an arbitrary hashing definition
my.Xtest <- my.Xstrd
my.H <- new("sdcHashClass", X = my.Xtest)
# * print a characterization of the root hash
print(summary(my.H))
flush.console()
# * identify big cell elements for removal first time through loop
SUcurRecordSet <- identifyBigCellRecords(my.H, cutoff = my.cutoff)
cat(paste(" done. Records remaining to search:",
nrow(my.Xtest) - length(SUcurRecordSet), "out of",
nrow(my.Xtest), "\n"))
flush.console()
# * initialize loop controling variables
SUmaxSize <- ncol(my.data)
SUcurSize <- SUindex <- 0
# * initialize return class
my.target <- new("sdcTargetDefinitionClass")
my.target@dim <- as.integer(dim(my.data))
my.target@dimnames <- dimnames(my.data)
my.target@cutoff <- as.integer(my.cutoff)
# Step 3: Stop processing if left with a small enough final cell
while(nrow(my.Xtest) - length(SUcurRecordSet) > my.cutoff) {
# * increment the size of record-discriminant sought
SUcurSize <- SUcurSize + 1
# * whittle down the data for searching
if(length(SUcurRecordSet)>0) my.Xtest <- my.Xtest[-SUcurRecordSet,]
# * initialize counters
SUcurIter <- 0
SUmaxIter <- choose(length(my.Hdef@levels),
length(my.Hdef@levels) - SUcurSize)
# * initialize collection variables for the outer loop
SUcurVector <- SUcurLength <- vector(length=0, mode="integer")
SUcurRecords <- vector(length=0, mode="integer")
SUcurHashes <- SUcurIndex <- vector(length=0, mode="character")
# Step 3-1: Loop for a particular SU size
cat(paste("\nSearching over ", SUmaxIter,
" target specifications of size ", SUcurSize,
"...", sep=""))
flush.console()
# Step 3-2: Allocate calculations to cores
my.combinations <- combn(length(my.Hdef@levels),
length(my.Hdef@levels) - SUcurSize,
simplify=FALSE)
my.last <- min(floor(length(my.combinations)/2), my.cores)
ddd <- floor(quantile(seq(my.combinations),
seq(0, 1, length.out = my.last + 1)))
my.index <- lapply(seq(ddd[-length(ddd)]),
function(i) { ddd[i]:(ddd[i+1]-1) })
my.index[[my.last]] <- c(my.index[[my.last]], length(my.combinations))
# Step 3-3: Prepare for parallel processing
my.cluster <- makeCluster(min(floor(length(my.combinations)/2), my.cores))
registerDoParallel(my.cluster)
SUobj <- foreach(i = my.index,
.packages = c("tuple", "sdcTarget")) %dopar% {
# Get the subset of combinations for this core
SUcurSubset <- my.combinations[i]
# Initialize collection variables for the inner loop
SUcurVector <- SUcurLength <- vector(length=0, mode="integer")
SUcurRecords <- vector(length=0, mode="integer")
SUcurHashes <- SUcurIndex <- vector(length=0, mode="character")
if(my.debug) {
my.time <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S_%Z")
my.tmp <- paste(paste(rep("0", max(0, 3-nchar(SUcurSize))),
collapse=""), SUcurSize, sep="")
sink(file = tempfile(pattern = paste("sdcTarget_", my.tmp, "_", my.time, sep = ""),
tmpdir = my.tempdir,
fileext = ".txt"))
print("==========")
cat("Index:\n")
print(i)
cat("\nRemaining Data:\n")
print(my.Xtest)
cat("\nHash Definition:\n")
print(my.Hdef)
print("=====")
}
for(SUcurDefinition in SUcurSubset) {
# Allow for tracking the progress of the search in real time
SUcurIter <<- SUcurIter + 1
# SUcurDefinition gives the set of variables included
# SUcurInvDefinition gives the set of variables excluded
SUcurInvDefinition <- setdiff(1:length(my.Hdef@levels), SUcurDefinition)
# SUindex is the overall combination counter that gets incremented
# each time one passes through this loop (i.e., across set sizes)
SUindex <- SUindex + 1
# Htmp holds the variable subset specific hash for testing, with
# missing values propagated to hashes if na.hash is FALSE, given
# the recode of Hdef pursuant to na.hash (see above)
Htmp <- new("sdcHashClass", X = my.Xtest, fields = SUcurDefinition)
if(my.debug) {
print("---")
print(SUcurDefinition)
print(Htmp)
}
# Identify the remaining popular cells
ddd <- identifyBigCells(Htmp, cutoff = my.cutoff)
if(my.debug) print(ddd)
# If there are any big cells, identify their records for exclusion
if(length(ddd)>0) {
# Identify the big cell records (like identifyBigCellRecords)
# (call to the external function would be less efficient here)
eee <- matchAll(unique(ddd), Htmp@hash)
SUcurVector <- c(SUcurVector, ddd)
SUcurLength <- c(SUcurLength, length(ddd))
SUcurRecords <- c(SUcurRecords, eee)
SUcurHashes <- c(SUcurHashes, Htmp@hash[eee])
SUcurIndex <- c(SUcurIndex, rep(paste(SUcurInvDefinition, collapse=" "),
length(eee)))
}
}
if(my.debug) sink()
return(list(SUcurVector = SUcurVector,
SUcurLength = SUcurLength,
SUcurRecords = SUcurRecords,
SUcurHashes = SUcurHashes,
SUcurIndex = SUcurIndex))
}
stopCluster(my.cluster)
# Step 3-3: Put the lists together
for(i in 1:length(SUobj)) {
SUcurVector <- c(SUcurVector, SUobj[[i]]$SUcurVector)
SUcurLength <- c(SUcurLength, SUobj[[i]]$SUcurLength)
SUcurRecords <- c(SUcurRecords, SUobj[[i]]$SUcurRecords)
SUcurHashes <- c(SUcurHashes, SUobj[[i]]$SUcurHashes)
SUcurIndex <- c(SUcurIndex, SUobj[[i]]$SUcurIndex)
}
# Step 4: Save and report the findings
SUcurRecordSet <- unique(SUcurRecords)
my.target@rownames <- c(my.target@rownames,
rownames(my.Xtest[SUcurRecords,]))
my.target@size <- c(my.target@size,
as.integer(rep(SUcurSize, length(SUcurRecords))))
my.target@index <- c(my.target@index, SUcurIndex)
my.target@hash <- c(my.target@hash, SUcurHashes)
cat("\n ...")
cat(paste(" found", length(SUcurVector), "targets of size", SUcurSize,
"in", length(SUcurRecordSet), "records."))
flush.console()
# * uniqueness is assured for residual unmatched data
if(SUcurSize == SUmaxSize - 1) {
if(length(SUcurRecordSet)>0)
my.Xtest <- my.Xtest[-SUcurRecordSet,]
if(nrow(my.Xtest) > 0) {
my.target@rownames <- c(my.target@rownames, rownames(my.Xtest))
my.target@size <- c(my.target@size,
as.integer(rep(SUmaxSize, nrow(my.Xtest))))
my.target@index <- c(my.target@index,
rep(paste(seq(SUmaxSize), collapse=" "),
nrow(my.Xtest)))
Htmp <- new("sdcHashClass", X = my.Xtest)
my.target@hash <- c(my.target@hash, Htmp@hash)
cat(paste("\nAdded", nrow(my.Xtest),
"unique records to the target definition.\n"))
flush.console()
}
break()
} else {
cat(paste("\nRecords remaining to search:",
nrow(my.Xtest)-length(SUcurRecordSet), "\n"))
flush.console()
}
} # end of while()
return(my.target)
} else return(.Object)
})
#' @title
#' Estimate Time To Complete Calculation of Target Definition
#' @description
#' This code provides a rough approximation for the number of
#' seconds that a calculation of target definition may run.
#' @param X
#' A data frame with the data (not necessarily standardised).
#' @param cutoff
#' The minimum residual cell size.
#' @param cores
#' The number of cores to be used in parallel processing.
#' The default is linear processing: i.e., \code{core = 1}.
#' @param na.recode
#' Whether NA values should be treated as levels.
#' @param Hdef
#' A hashing definition (might be generated from a larger set of data).
#' @examples
#' set.seed(256)
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' estimateTimeToComputeTargetDefinition(X = my.X)
#' @export
estimateTimeToComputeTargetDefinition <- function(
X,
cutoff = 1,
cores = 1,
na.recode = TRUE,
Hdef = new("sdcHashingDefinitionClass",
X = X, na.recode = na.recode)
) {
# Variables starting with X derive from the data.
# Variables starting with H relate to the hashes.
#
getStandardData <- function (X, H) {
ddd <- X
if(!all(class(X) %in% "factor"))
ddd <- data.frame(lapply(X, function(Z) { factor(Z) }))
my.special <- matchNone(ddd)
for(i in seq(ddd)) {
if(class(ddd[[i]]) %in% "factor") {
for(j in seq(H@levels[[i]]))
levels(ddd[[i]])[levels(ddd[[i]]) == H@levels[[i]][j]] <-
paste(my.special, j, sep = "")
levels(ddd[[i]]) <- substr(levels(ddd[[i]]),
start = nchar(my.special) + 1,
stop = max(nchar(levels(ddd[[i]]))))
}
if(H@na.recode)
levels(ddd[[i]])[is.na(X[[i]])] <- 0
}
ddd
}
# Get the data
my.data <- X
if(is.null(my.data) || !is.data.frame(my.data))
stop("Data must be provided in a data frame.")
# Check if recoding is specified
my.na_recode <- na.recode
if(!is.logical(my.na_recode) || length(my.na_recode) != 1)
stop("Recoding should be specified as a single logical value.")
# Get the hashing definition
my.Hdef <- Hdef
if(!is(my.Hdef, "sdcHashingDefinitionClass"))
stop("Hashing definition must be of class sdcHashDefinitionClass.")
# Check if cutoff is specified
my.cutoff <- cutoff
if(!is.numeric(my.cutoff) || length(my.cutoff) != 1)
stop("Cutoff should be specified as a single integer value.")
my.cutoff <- as.integer(my.cutoff)
# Check how many cores to use in estimate
my.cores <- cores
if(!is.numeric(my.cores) || length(my.cores) != 1)
stop("Number of cores should be specified as a single integer value.")
my.cores <- as.integer(my.cores)
SUallTime <- as.list(system.time({
# Step 1: Make a dataset with NA values set to 0 in each category.
cat("Preparing the data...")
flush.console()
my.Xstrd <- getStandardData(my.data, my.Hdef)
cat(paste(" done.", "\nCalculating statistics and",
"removing popular cell records..."))
flush.console()
# Step 2: Remove records in the popular cells (cutoff=1)
# * remove all records with the same actual _or_ missing values
# * does not currently permit use of an arbitrary hashing definition
my.Xtest <- my.Xstrd
my.H <- new("sdcHashClass", X = my.Xtest)
# * identify big cell elements for removal first time through loop
SUcurRecordSet <- identifyBigCellRecords(my.H, cutoff = my.cutoff)
cat(paste(" done.\nRecords remaining to search:",
nrow(my.Xtest) - length(SUcurRecordSet), "out of",
nrow(my.Xtest), "\n\n"))
flush.console()
}))$elapsed
# * initialize loop controling variables
SUmaxSize <- ncol(my.data)
SUcurSize <- SUindex <- 0
SUcurTime <- vector(length=0, mode="numeric")
SUcurLength <- vector(length=0, mode="integer")
# Step 3: Stop processing if left with a small enough final cell
while(nrow(my.Xtest) - length(SUcurRecordSet) > my.cutoff) {
SUallTime <- SUallTime + as.list(system.time({
# * increment the size of record-discriminant sought
SUcurSize <- SUcurSize + 1
# * whittle down the data for searching
if(length(SUcurRecordSet)>0) my.Xtest <- my.Xtest[-SUcurRecordSet, , drop = FALSE]
# * initialize counters
my.last <- 3
SUcurIter <- 0
SUmaxIter <- min(choose(length(my.Hdef@levels),
length(my.Hdef@levels) - SUcurSize), my.last)
# Step 3-1: Loop for a particular SU size
cat(paste("Estimating over ", SUmaxIter,
" target specifications of size ", SUcurSize,
"...", sep=""))
flush.console()
# Step 3-2: Prepare to calculate times
my.combinations <- sample(combn(length(my.Hdef@levels),
length(my.Hdef@levels) - SUcurSize,
simplify=FALSE), SUmaxIter)
# Get the subset of combinations for this core
SUcurSubset <- my.combinations
}))$elapsed
SUcurTime <- c(SUcurTime, as.list(system.time(
for(SUcurDefinition in SUcurSubset) {
# Allow for tracking the progress of the search in real time
SUcurIter <<- SUcurIter + 1
# SUcurDefinition gives the set of variables included
# SUcurInvDefinition gives the set of variables excluded
SUcurInvDefinition <- setdiff(1:length(my.Hdef@levels), SUcurDefinition)
# SUindex is the overall combination counter that gets incremented
# each time one passes through this loop (i.e., across set sizes)
SUindex <- SUindex + 1
# Htmp holds the variable subset specific hash for testing, with
# missing values propagated to hashes if na.hash is FALSE, given
# the recode of Hdef pursuant to na.hash (see above)
Htmp <- new("sdcHashClass", X = my.Xtest, fields = SUcurDefinition)
# Identify the remaining popular cells
ddd <- identifyBigCells(Htmp, cutoff = my.cutoff)
}))$elapsed / SUmaxIter)
SUcurLength <- c(SUcurLength, SUcurSize)
print(SUcurTime)
# Step 4: Save the findings
cat(paste(" done.\n"))
flush.console()
my.decay <- 0.5
SUremainder <- floor(nrow(my.Xtest) * (1 - my.decay))
if(SUremainder < SUmaxIter) break()
SUcurRecordSet <- sample(seq(nrow(my.Xtest)),
max(SUmaxIter, SUremainder))
} # end of while()
my.estimate <- choose(ncol(my.data), 1:(ncol(my.data)-1))[1:length(SUcurTime)] *
SUcurTime / my.cores
SUallTime <- SUallTime + sum(my.estimate)
cat(paste("\nEstimated Overall Seconds:", SUallTime, "\n"))
return(invisible(list(overall = SUallTime, inner.loop = my.estimate, cores = cores)))
}
#' @title
#' S4 Target Matrix
#' @description
#' The taget matrix is a matrix with the same dimensions as the
#' data from which it is derived, that indicates the number of
#' combinations at a specific level of targetting for which the
#' synthesis of a data element will make the record "sufficiently
#' common".
#' @details
#' Additional information is stored in the slots.
#' @slot .Data
#' A matrix.
#' @slot Tdef
#' A target definition class object.
#' @export
setClass("sdcTargetMatrixClass",
contains = c("matrix"),
slots = c(Tdef = "sdcTargetDefinitionClass"),
prototype = prototype(Tdef = new("sdcTargetDefinitionClass")),
validity = function(object) {
TRUE
} )
#' @describeIn sdcTargetMatrixClass
#' @param .Object
#' An \code{\linkS4class{sdcTargetMatrixClass}} object.
#' @param ...
#' The optional parameters specifying the basis of the
#' target matrix (Tdef).
#' @examples
#' set.seed(256)
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.tdef <- new("sdcTargetDefinitionClass", X = my.X)
#' new("sdcTargetMatrixClass", Tdef = my.tdef)
#' @seealso \code{\linkS4class{sdcTargetDefinitionClass}},
#' @export
setMethod("initialize", "sdcTargetMatrixClass",
function(.Object, ...) {
# Get any specified constructor elements
my.names <- names(match.call(expand.dots = TRUE))
# Get the data
my.tmp <- pmatch(my.names, c("Tdef", "tdef"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one dataset on which the hashing is to be defined.")
my.tdef_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.tdef <- NULL
if(my.tdef_index)
my.tdef <- eval(parse(text = match.call()[my.names[my.tdef_index]]))
if(!is.null(my.tdef) && !is(my.tdef, "sdcTargetDefinitionClass"))
stop("Target definition not an sdcTargetDefinitionClass object.")
if(!is.null(my.tdef) && (length(my.tdef@index) > 0)) {
# Count of targets found by target specification, ordered by prevalence
# (descending), and within prevelance ordered by number of variables
# counted in definition:
ddd <- table(my.tdef@index)[order(-table(my.tdef@index),
unlist(lapply(gregexpr(" ",names(table(my.tdef@index))),
length)))]
# Count of targets found by number of associated variables (where it is
# assumed that the target definition corresponds to the values identified
# as risky, and not the opposite) times the number of such variables
# divided by total values in dataset (i.e., proportion of SDC data):
eee <- sum(ddd*(unlist(lapply(lapply(gregexpr(" ",
dimnames(ddd)[[1]]), function(Z) {Z>0}), sum))+1)) /
prod(my.tdef@dim) * 100
cat(paste("Overall proportion of values for SDC: ",
signif(eee,3), "%", sep=""))
} else if(!is.null(my.tdef) && any(my.tdef@dim > 0))
cat("Overall proportion of values for SDC: 0%\n")
flush.console()
if(!is.null(my.tdef) && (length(my.tdef@index) > 0)) {
# Retrieve the row name associated with each target
fff <- unlist(lapply(strsplit(my.tdef@rownames, "\\."), function(Z) { Z[1] }))
fffu <- unique(fff)
fffunum <- length(fffu)
# Proportion of rows with data subject to SDC:
ggg <- fffunum / my.tdef@dim[1] * 100
cat(paste("\nProportion of data records subject to SDC: ",
signif(ggg,3), "% (", fffunum,
" records out of ", my.tdef@dim[1], ")", sep=""))
flush.console()
# Proportion of records having multiple targets by target size:
hhhn2 <- hhhn <- hhhd <- vector(length=my.tdef@dim[2], mode="integer")
for(i in seq(my.tdef@dim[2])) {
hhhi <- table(fff[my.tdef@size == i])
hhhn[i] <- length(hhhi[hhhi > 1])
hhhn2[i] <- length(unique(fff[my.tdef@size == i][which(unlist(lapply(
strsplit(my.tdef@rownames[my.tdef@size == i], "\\."),
length)) > 1)]))
hhhd[i] <- length(unique(fff[my.tdef@size == i]))
}
cat(paste("\nProportion of records having multiple targets by target size:",
"\n", paste(signif(hhhn/hhhd*100,3), collapse="% "),
"\n (NaN indicates that no targets of that size were found)",
"\n Check two different calculations: ", identical(hhhn, hhhn2)))
# Check that within record there is only one size of target
cat("\nVerifying that each record associates with only one target size... ")
flush.console()
cat(length(table(apply(table(fff, my.tdef@size), 1, function(Z) {sum(Z>0)})))==1)
# Create target substitution matrix for the data
cat("\nCreating target substitution matrix for the SDC data subset...")
flush.console()
yyy <- matrix(0, fffunum, my.tdef@dim[2])
# * open the target specifications
zzz <- lapply(strsplit(my.tdef@index, " "), as.integer)
# * map the target records to their unique record values
www <- match(fff, fffu)
# * add up the targets in each record
for(i in 1:length(fff))
yyy[www[i], zzz[[i]]] <- yyy[www[i], zzz[[i]]] + 1
cat(" done.\nMapping to original data...")
flush.console()
yyyo <- matrix(0, my.tdef@dim[1], my.tdef@dim[2])
rownames(yyyo) <- my.tdef@dimnames[[1]]
colnames(yyyo) <- my.tdef@dimnames[[2]]
www <- match(fffu, my.tdef@dimnames[[1]])
for(i in 1:length(fffu)) yyyo[www[i],] <- yyy[i,]
cat(" done.")
flush.console()
# Print out various checks on the target substitution matrix
cat("\nMinimum number of records at each level of preservation:")
print(table(apply(yyyo, 1, function(Z) {sum(Z==0)})))
cat("\nPercent synthesis required by field for maximum partial synthesis:\n")
print(signif(apply(yyyo, 2, function(Z) {sum(Z!=0)})/nrow(yyyo)*100, 3))
flush.console()
} else {
yyyo <- matrix(nrow=0, ncol=0)
my.tdef <- new("sdcTargetDefinitionClass")
}
# Prepare to return the sdcTargetMatrixClass object
.Object@.Data <- yyyo
.Object@Tdef <- my.tdef
return(invisible(.Object))
})
#' @title
#' S4 Substitution Matrix
#' @description
#' The substitution matrix is a matrix with the same dimensions as the
#' data from which it is derived, that indicates which elements are to
#' be subjected to a statistical disclosure control process.
#' @details
#' The substitution matrix is calculated from a target matrix.
#' Specification of forwards direction with a \code{cutoff} of 0
#' results in a complete data synthesis substitution matrix.
#' Specification of forwards direction with a \code{cutoff} of 1,
#' or backwards direction with a sufficiently large \code{cutoff},
#' results in the maximum partial synthesis substitution matrix.
#' Specification of backwards direction with a \code{cutoff} of 1
#' results in the minimum partial synthesis substitution matrix.
#' Maximum partial synthesis is the default.
#' @slot .Data
#' A matrix.
#' @slot forwards
#' Indicates the direction in which to process the target matrix.
#' Defaults to \code{TRUE}.
#' @slot cutoff
#' Cutoff with respect to which an element in the target matrix
#' should be indicated for substitution. Default value is 1.
#' @slot T
#' A target matrix object.
#' @export
setClass("sdcSubstitutionMatrixClass",
contains = c("matrix"),
slots = c(forwards = "logical",
cutoff = "numeric",
T = "sdcTargetMatrixClass"),
prototype = prototype(forwards = TRUE,
cutoff = 1,
T = new("sdcTargetMatrixClass",
Tdef = new("sdcTargetDefinitionClass"))),
validity = function(object) {
TRUE
} )
#' @describeIn sdcSubstitutionMatrixClass
#' @param .Object
#' An \code{\linkS4class{sdcSubstitutionMatrixClass}} object.
#' @param ...
#' The optional parameters specifying the basis of the
#' substitution matrix. These include an indicator of
#' forwards or backwards matrix calculation
#' (\code{forwards}),
#' a cutoff value relative to entries in the target matrix
#' (\code{cutoff}), and the target matrix basis for the
#' calculation (\code{T}).
#' @examples
#' set.seed(256)
#' my.X <- data.frame(matrix(ifelse(runif(500)>.5, TRUE, FALSE), ncol = 5))
#' my.smc <- new("sdcSubstitutionMatrixClass",
#' T = new("sdcTargetMatrixClass",
#' Tdef = new("sdcTargetDefinitionClass", X = my.X)))
#' @seealso \code{\linkS4class{sdcTargetMatrixClass}},
#' @export
setMethod("initialize", "sdcSubstitutionMatrixClass",
function(.Object, ...) {
# Get any specified constructor elements
my.names <- names(match.call(expand.dots = TRUE))
# Get the target matrix
my.tmp <- pmatch(my.names, c("Tmatrix", "tmatrix"))
if(sum(!is.na(my.tmp)) > 1)
stop("Specify only one dataset on which the hashing is to be defined.")
my.T_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.T <- NULL
if(my.T_index)
my.T <- eval(parse(text = match.call()[my.names[my.T_index]]))
if(is.null(my.T) || !is(my.T, "sdcTargetMatrixClass"))
stop("No target matrix provided.")
# Check if direction of calculation is specified
my.tmp <- pmatch(my.names, c("forwards"))
my.forwards_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.forwards <- TRUE
if(my.forwards_index)
my.forwards <- eval(parse(text = match.call()[my.names[my.forwards_index]]))
if(!is.logical(my.forwards) || length(my.forwards) != 1)
stop("Direction of calculation (forwards) should be specified as a single logical value.")
# Check if cutoff is specified
my.tmp <- pmatch(my.names, c("cutoff"))
my.cutoff_index <- ifelse(all(is.na(my.tmp)), 0, min(which(!is.na(my.tmp))))
my.cutoff <- 1
if(my.cutoff_index)
my.cutoff <- eval(parse(text = match.call()[my.names[my.cutoff_index]]))
if(!is.numeric(my.cutoff) || length(my.cutoff) != 1)
stop("Cutoff should be specified as a single integer value.")
my.cutoff <- as.integer(my.cutoff)
# Create a working set of target variable names
my.target_names <- colnames(my.T)
# Stop if the target columns are unnamed
if((ncol(my.T) > 0) && is.null(my.target_names))
stop("Columns of the target matrix have no names.")
# Turn the target matrix into a data frame, so that it will keep
# column names and format even if it turns into a single column.
my.Target <- data.frame(my.T)
if(length(my.target_names)==1) names(my.Target) <- my.target_names
# Apply the appropriate directional test
if(my.forwards) {
cat(paste("Applying forwards cut with cutoff ", my.cutoff, ".\n", sep=""))
.Object@.Data <- apply(my.Target, 2, function(Z) { Z >= my.cutoff })
} else {
cat(paste("Applying backwards cut with cutoff ", my.cutoff, ".\n", sep=""))
# Catch if target has been reduced to a vector from a matrix
ddd <- apply(my.Target, 1, max)
.Object@.Data <- apply(my.Target, 2, function(Z) { (Z > 0) & (Z > ddd - my.cutoff) })
}
rownames(.Object@.Data) <- rownames(my.T)
# Fill and return the substitution matrix object
.Object@forwards <- my.forwards
.Object@cutoff <- my.cutoff
.Object@T <- my.T
return(invisible(.Object))
})
#getSdcRows <- function(..., criterion="random",
# rows=NULL, patterns=NULL, like=NULL,
# size=NULL, quantity=NULL,
# X=NULL, TD=NULL,
# n=1) {
# ddd <- charmatch(criterion,
# c("random", "rows", "patterns", "like",
# "size", "quantity"))
# if(!is.null(rows)) ddd <- 2
# if(!is.null(patterns)) ddd <- 3
# if(!is.null(like)) ddd <- 4
# if(ddd==1 && !is.null(TD)) {
# fff <- unlist(lapply(strsplit(TD$RowID, "\\."), function(Y) { Y[1] }))
# if(!is.null(size)) fff <- fff[TD$Size==size]
# if(!is.null(quantity)) fff <- names(table(fff)[table(fff)==3])
# if(is.numeric(n) && length(n)==1 && n>0)
# fff <- sample(fff, min(length(fff),n))
# if(!is.null(X)) return(X[match(fff, rownames(X)),])
# else return(fff)
# } else if(ddd==2 && !is.null(TD)) {
# fff <- unlist(lapply(strsplit(TD$RowID, "\\."), function(Y) { Y[1] }))
# if(!is.null(X)) return(X[match(fff, rownames(X)),])
# else {
# ggg <- matchAll(rows, fff)
# ggg <- ggg[order(TD$RowID[ggg])]
# return(data.frame(RowID=unlist(lapply(strsplit(TD$RowID[ggg], "\\."),
# function(Y) { Y[1] })),
# Size=TD[[2]][ggg],
# Index=TD[[3]][ggg],
# Hash=TD[[4]][ggg]))
# }
# } else if(ddd==3 && !is.null(patterns) && !is.null(TD) && !is.null(X)) {
# if(is.numeric(patterns))
# eee <- matchAll(paste(sort(patterns), collapse=" "), TD$Index)
# else eee <- matchAll(patterns, TD$Index)
# fff <- unique(unlist(lapply(strsplit(TD$RowID[eee], "\\."),
# function(Y) { Y[1] })))
# ggg <- X[match(fff[order(TD$Hash[fff])], rownames(X)),]
# return(ggg)
# } else if(ddd==4 && !is.null(like) && !is.null(TD) && !is.null(X)) {
# fff <- unlist(lapply(strsplit(TD$RowID, "\\."), function(Y) { Y[1] }))
# ggg <- matchAll(like, fff)
# hhh <- getSdcRows(patterns=unique(TD$Index[ggg]), TD=TD, X=X)
# return(hhh)
# } else {
# stop("Please give a full specification.")
# }
#}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.