Nothing
#
# Reconstruction Set Test (RESET)
#
# @author rob.frost@dartmouth.edu
#
#
# Implementation of the RESET method.
#
# X: n-by-p target matrix; columns represent variables and rows represent samples
# X.test: matrix (subset or transformation of X, e.g., projection on top PCs) that will be combined with the var.set variables
# to compute the reduced rank reconstruction via randomized SVD.
# Reconstruction error will be measured on the variables in X.test.
# If not specified, the X matrix will be used.
# center.X: Flag which controls whether the values in X are mean centered during execution of the algorithm.
# If only X is specified and center.X=TRUE, then all columns in X will be centered. If both X and X.test are specified,
# then centering performed on just the columns of X contained in the specified variable sets.
# scale.X: Flag which controls whether the values in X are scaled to have variance 1 during execution of the algorithm.
# If only X is specified and scale.X=TRUE, then all columns in X will be scaled. If both X and X.test are specified,
# then scaling is performed on just the columns of X contained in the specified variable sets.
# center.X.test: Flag which controls whether the values in X.test are mean centered during execution of the algorithm.
# scale.X.test: Flag which controls whether the values in X.test are scaled to have variance 1 during execution of the algorithm.
# var.sets: list of m variable sets, each element is a vector of indices of variables in the set
# k: Rank of reconstruction. Default to 2. Must be greater than or equal to the minimum variable set size.
# random.threshold: If specified, indicates the variable set size above which a randomized reduced-rank reconstruction is used. If
# the variable set size is less or equal to random.threshold, then a non-random reconstruction is computed. Defaults to k
# and cannot be less than k.
# k.buff: Additional dimensions used in randomized reduced-rank construction algorithm. Defaults to 0.
# q: Number of power iterations for randomized SVD. Defaults to 0.
# test.dist: Distribution for elements of random test matrix used in randomized algorithm. Must be one of "normal" or "uniform".
# norm.type: The type of norm to use for computing reconstruction error. Defaults to "2" for Euclidean/Frobenius norm. Other supported option
# is "1" for L1 norm.
# per.var: If true, the computed scores for each variable set are divided by the variable set size to generate "per variable" scores.
#
# Returns a list with two elements:
#
# S: n-by-m matrix of sample-level variable set scores computed by RESET (not returned if single.sample is false)
# v: length m vector of the overall variable set scores computed by RESET
#
reset = function(X, X.test, center.X=TRUE, scale.X=FALSE, center.X.test=TRUE, scale.X.test=FALSE,
var.sets, k=2, random.threshold, k.buff=0, q=0, test.dist="normal", norm.type="2",
per.var=FALSE) {
if (missing(X)) {
stop("Matrix X not specified!")
}
if (missing(var.sets)) {
stop("List of variable set indices var.sets must be specified!")
}
num.sets = length(var.sets)
if (num.sets == 0) {
stop("Must specify at least one variable set!")
}
min.set.size = min(sapply(var.sets, length))
if (min.set.size == 0) {
stop("All variable sets must contain at least one member!")
}
if (k < 1) {
stop("k must be greater than 1!")
}
if ((k+k.buff) > min.set.size) {
stop("k+k.buff is ", k+k.buff, ", which is larger than the minimum set size of ", min.set.size, "!")
}
if (missing(random.threshold)) {
random.threshold = k
message("Setting random.threhold to specified k of ", k)
} else if (random.threshold < k) {
stop("random.threshold cannot be than k!")
}
if (k.buff < 0) {
stop("k.buff cannot be negative!")
}
if (q < 0) {
stop("q cannot be negative!")
}
if (norm.type != "2" && norm.type != "1") {
stop("unsupported norm.type ", norm.type)
}
# if (!missing(weight.type)) {
# if (!(weight.type %in% c("mean", "stand.mean", "mean.abs", "stand.mean.abs"))) {
# stop("weight.type must be one of 'mean', 'stand.mean', 'mean.abs' or 'stand.mean.abs'!")
# }
# }
p = ncol(X)
n = nrow(X)
m = length(var.sets)
# If requested, centering X or X.test
if (!missing(X.test)) {
if (center.X.test || scale.X.test) {
X.test = scale(X.test, scale=scale.X.test, center=center.X.test)
}
} else {
if (center.X || scale.X) {
X = scale(X, scale=scale.X, center=center.X)
}
X.test = X
}
# Compute the L1 or L2 norm of each row and the test matrix
X.row.norms = apply(X.test, 1, function(x){vectorNorm(x,norm.type)})
# Get correct matrix norm type
m.norm = norm.type
if (norm.type == "2") {
m.norm = "F"
}
X.norm = norm(X.test, type=m.norm)
# matrix that holds sample level scores
S = matrix(0,nrow=n,ncol=m)
# vector that holds overall set scores
v = rep(0, m)
message("Computing scores for collection of ", m, " sets")
var.set.sizes = unlist(lapply(var.sets, length))
mean.var.set.size = mean(var.set.sizes)
for (i in 1:m) {
var.set = var.sets[[i]]
var.set.name = names(var.sets)[i]
var.set.size = var.set.sizes[i]
if (i %% 50 == 0) {
message("Computing scores for set ", i, " (", var.set.name, ") with size ", var.set.size)
}
# Subset X for the variable set
X.var.set = X[,var.set]
# If X.test was specified and center.X is true, then
# the subset needs to be centered
if (!missing(X.test) && (center.X || scale.X)) {
if (scale.X) {
# Remove any zero variance columns
nonzero.var = which(apply(X.var.set, 2, function(x) {min(x) != max(x)}))
num.nonzero.var = length(nonzero.var)
if (num.nonzero.var != var.set.size) {
warning("Removing ", (var.set.size-num.nonzero.var), " columns with zero variance from set ", var.set.name)
X.var.set = X.var.set[,nonzero.var]
}
}
X.var.set = scale(X.var.set, center=center.X, scale=scale.X)
}
# Compute sample-level weights if needed
# if (!missing(weight.type)) {
# var.vals = X.var.set
# if (endsWith(weight.type, ".abs")) {
# var.vals = abs(var.vals)
# }
# var.set.weights = apply(var.vals, 1, mean)
# if (startsWith(weight.type, "stand.")) {
# var.set.weights = scale(var.set.weights, center=FALSE, scale=TRUE)
# }
# }
# Get rank k basis for column space of X.var.set
if (var.set.size <= random.threshold) {
# Variable set size is <= random.threshold,
# so compute orthonormal basis directly using column-pivoted QR
Q = qr.Q(qr(X.var.set, LAPACK=TRUE))
} else {
# Variable set size is > random.threshold, so use a randomized algorithm
# implemented by randomColumnSpace to compute the rank k orthonormal basis
Q = randomColumnSpace(X=X.var.set,k=(k+k.buff),q=q,test.dist=test.dist)
}
if (ncol(Q) < k) {
warning("Q has less than k columns!")
}
# Use the first k columns of Q as the rank k basis
Q = Q[,1:k]
# Compute reconstruction of X.test by projecting onto Q
X.recon = Q %*% (t(Q) %*% X.test)
# Compute reconstruction error
X.diff = X.test - X.recon
# Compute norm of reconstruction error matrix
X.diff.norm = norm(X.diff,type=m.norm)
if (X.diff.norm == 0) {
warning("0 reconstruction error for set ", i, "! Overall score will be set to infinity.")
}
# Compute the row-level scores as log2 fold-change of the L1 or L2 norm of original row relative to the equivalent norm of the error.
# Larger values reflect improved reconstruction performance.
X.diff.norms = apply(X.diff, 1, function(x){vectorNorm(x,norm.type)})
if (length(which(X.diff.norms == 0)) > 0) {
warning("0 reconstruction error for set ", i, "! Overall score will be set to infinity.")
}
S[,i] = log2(X.row.norms/X.diff.norms)
# if weight.type is specified, then multiple scores by the appropriate weights
# if (!missing(weight.type)) {
# S[,i] = S[,i] * var.set.weights
# }
# Compute overall score as the log2 fold-change of the L1 or Frobineus norm of the original test matrix relative to the relative to the equivalent norm
# of the error
v[i] = log2(X.norm/X.diff.norm)
# If per-variable scores are desired, scale by var.set.size
if (per.var) {
S[,i] = S[,i]/(var.set.size/mean.var.set.size)
v[i] = v[i]/(var.set.size/mean.var.set.size)
}
}
S[which(is.nan(S))] = 0
rownames(S) = rownames(X)
colnames(S) = names(var.sets)
names(v) = names(var.sets)
results = list()
results$S = S
results$v = v
return (results)
}
#
# Wrapper around the reset() function that uses the top PCs as X.test.
#
resetViaPCA = function(X, center=TRUE, scale=FALSE, num.pcs=2, pca.buff=2, pca.q=1,
var.sets, k=2, random.threshold, k.buff=0, q=0, test.dist="normal", norm.type="2",
per.var=FALSE) {
if (missing(X)) {
stop("Matrix X not specified!")
}
if (num.pcs < 1) {
stop("num.pcs cannot be less than 1!")
}
if (pca.buff < 0) {
stop("pca.buff cannot be negative!")
}
if (pca.q < 0) {
stop("pca.q cannot be negative!")
}
if (missing(random.threshold)) {
random.threshold = k
}
# center/scale as desired
if (center || scale) {
X = scale(X, center=center, scale=scale)
}
# use randomized SVD algorithm to compute approximate projection onto the top PCs
X.pcs = X %*% randomSVD(X=X,k=(num.pcs+pca.buff),q=pca.q,test.dist=test.dist)$v[,1:num.pcs]
# call reset()
# if the data was not centered before calling randomSVD, then center it when
# needed in reset()
reset.results = reset(X=X,X.test=X.pcs,center.X=!center,center.X.test=!center,
scale.X=FALSE,scale.X.test=FALSE, var.sets=var.sets,
k=k, random.threshold=random.threshold, k.buff=k.buff,
q=q,test.dist=test.dist,norm.type=norm.type, per.var=per.var)
return (reset.results)
}
#
# Utility function that creates a variable set list in the format required
# by rest() (i.e., list of variable indices) given the variable names and a
# list of variable names.
#
# Inputs:
#
# -var.names: Vector of variable names. This should correspond to the order of variables in
# the target matrix X.
# -var.sets: List of m variable sets where each element in the list corresponds to
# a set and the list element is a vector variable names. List names are variable set names.
# -min.size: Minimum set size after filtering out variable not in the var.names vector.
# Sets whose post-filtering size is below this are removed from the final
# collection list. Default is 1 and cannot be set to less than 1.
# -max.size: Maximum variable set size after filtering out variables not in the var.names vector.
# Sets whose post-filtering size is above this are removed from the final
# collection list. If not specified, no filtering is performed.
#
# Output:
#
# Version of the input variable set collection list where variable names have been replaced by position indices,
# variables not present in the var.names vector have been removed and sets failing the min/max size
# constraints have been removed.
#
createVarSetCollection = function(var.names, var.sets, min.size=1, max.size) {
# min.size must be at least 1
if (min.size < 1) {
stop("Invalid min.size value! Must be 1 or greater.")
}
# If max size is set, make sure it is not less than min size
if (!missing(max.size)) {
if (max.size < min.size) {
stop("max.size cannot be less than min.size!")
}
}
num.vars = length(var.names)
if (num.vars < 1) {
stop("var.names must contain at least one variable!")
}
num.sets = length(var.sets)
if (num.sets < 1) {
stop("var.sets must contain at least one set!")
}
set.names = names(var.sets)
var.set.indices = list()
for (i in 1:num.sets) {
set.var.names = var.sets[[i]]
# map names to indices
set.indices = unlist(sapply(set.var.names, function(x){which(var.names == x)}))
set.size = length(set.indices)
if (set.size < min.size) {
next
}
if (!missing(max.size)) {
if (set.size > max.size) {
next
}
}
current.index = length(var.set.indices)+1
var.set.indices[[current.index]] = set.indices
names(var.set.indices)[current.index] = set.names[i]
}
return (var.set.indices)
}
#
# Convert the overall and sample-level RESET scores to/from per-variable scores.
#
# Inputs:
#
# reset.out: Results from reset() or resetViaPCA()
# var.sets: List containing variable set indices (this must be the same list used to call reset()).
# to.per.var: If true, converts to per-variable scores, i.e., divides scores by variable set size.
# If false, converts from per-variable scores, i.e., multiplies scores by variable set size.
#
# Returns a modified RESET output list.
#
convertToPerVarScores = function(reset.out, var.sets, to.per.var=TRUE) {
if (missing(reset.out)) {
stop("reset.out must be specified!")
}
if (missing(var.sets)) {
stop("var.sets must be specified!")
}
# Compute the size of each variable set
var.set.size = unlist(lapply(var.sets, length))
# Scale the sizes by the mean size
# Thiw will prevent the scores from dramatically changing in magnitude.
var.set.size = var.set.size/mean(var.set.size)
if (to.per.var) {
reset.out$v = reset.out$v/var.set.size
reset.out$S = t(apply(reset.out$S, 1, function(x) {
return (x/var.set.size)
}))
} else {
reset.out$v = reset.out$v * var.set.size
reset.out$S = t(apply(reset.out$S, 1, function(x) {
return (x * var.set.size)
}))
}
return (reset.out)
}
#
# Utility function that computes the L1 or L2 norm of a vector
#
vectorNorm = function(x, norm.type="2") {
if (norm.type == "1") {
return (sum(abs(x)))
} else if (norm.type == "2") {
return (sqrt(sum(x^2)))
} else {
stop("Invalid norm.type ", norm.type)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.