# R/maf.R In matzhaugen/maf: Maximum Autocorrelation Factors

#### Documents: maf, test.Maf, plot.Maf, MAFpart

#' Maximum Autocorrelation Factors
#'
#' This function performs a linear transformation of the data where the new time series
#' are mutually orthogonal with identity covariance matrix.
#' The input is a matrix of column vector time series.
#' @param x A matrix where each column is a time series. The number of
#' rows must be greater than the number of columns
#' @return An object of class "Maf" containing the following items:
#' \describe{
#' 	\item{\code{x}}{The original data matrix.}
#' 	\item{\code{autocor}}{A vector of autocorrelations for each maf factor.}
#'  \item{\code{rotation}}{A matrix of rotation vectors, each column representing a vector,
#' 	with the first column containing the first rotation (MAF1), the second representing
#'  the second rotation and so on. Each column's squares sum to 1.}
#'  \item{\code{mafs}}{Contains the matrix with maf factors in the
#' 	columns, with highest autocorrelation in the first column.}
#' }
#' @examples
#' # Extract mafs from dataset
#' maf.object = maf(treeringTimeseries)
#' # Plot the first mafs with uncertainty estimates and estimate number of
#' # significant mafs contained in the dataset
#' plot(maf.object)
#-------------
#' @export
maf <- function(x) {
  # Guard: need a data.frame or a multi-column matrix. The previous
  # class(x) == "matrix" test breaks in R >= 4.0, where matrices have
  # class c("matrix", "array") and the comparison has length 2.
  if (is.data.frame(x) || (is.matrix(x) && dim(x)[2] > 1)) {
    p = dim(x)[2]
    n = dim(x)[1]
    x = scale(x)
    svd = svd(cov(x)) # If you want the sum of the timesteps/n to be 1
    # svd = svd(cov(x)*(n-1)) # If you want the sum of the timesteps to be 1

    # Whitening transform: y = x %*% a has identity covariance.
    a = svd$u %*% diag(svd$d^(-0.5)) %*% t(svd$u)
    y = x %*% a
    # Covariance of first differences of the whitened series.
    cov.zd = cov(apply(y, 2, diff)) * (n - 2)
    svd.zd = svd(cov.zd / (n - 1))
    # Reverse column order so the highest-autocorrelation factor is first.
    u = svd.zd$u[, p:1]
    aa = a %*% u
    # Normalize each rotation vector to unit length.
    aa = apply(aa, 2, function(v) v / sqrt(sum(v^2)))

    maf = x %*% aa
    # Flip signs so each factor correlates positively with the time index.
    neg.time.cor = diag(cov(maf, matrix(rep(1:n, p), n, p))) < 0
    mafs = t(apply(maf, 1, function(v) (-neg.time.cor + !neg.time.cor) * v))
    aa = t(apply(aa, 1, function(v) (-neg.time.cor + !neg.time.cor) * v))

    d = svd.zd$d[p:1]
  } else {
    print("x is not a matrix or a data.frame, no MAF transform performed.")
    mafs = as.matrix(x)
    aa = as.matrix(1)
    d = as.matrix(1)
  }
  # Bug fix: previously returned `mafs = maf`, i.e. the un-sign-corrected
  # factors (and in the degenerate branch `maf` even resolved to this very
  # function). Return the sign-corrected `mafs`, consistent with `rotation`.
  out = list(x = x, mafs = mafs, rotation = aa, autocor = 1 - d / 2)
  class(out) = c("Maf")
  out
}

# This function does what "filter" does for 2 sides except that
# the vector length is preserved by renormalizing the filter window with
# the available timesteps. f must be odd length.
# If f is a scalar, the value is assumed to be the number of years per degree of
# freedom. E.g. if f = 10, a 100 length vector will have 10 DoF and 11 knots.
filter2 <- function(x, f) {
  x = as.matrix(x)
  l = dim(x)[1]
  if (length(f) == 1) {
    if (f == 1 || f == 0) {
      # No smoothing requested.
      return(x)
    } else if (f < 1 && f > 0) {
      # f in (0, 1): interpret directly as a loess span.
      return(apply(as.matrix(x), 2, function(y) {
        loess(y ~ c(1:l), span = f)$fitted
      }))
    } else {
      # f > 1: interpret as timesteps per degree of freedom.
      return(apply(as.matrix(x), 2, function(y) {
        loess(y ~ c(1:l), span = 2 * f / l)$fitted
      }))
    }
  }
  # NOTE(review): a vector-valued f falls through and returns NULL, as in
  # the original implementation — confirm whether that is intended.
}

# Function: blockBs(x, block.size, circular = FALSE)
# ---------------------------------
# Takes an n-by-p matrix and returns a resampled version with the same
# predictor order and size but permuted rows. The permutation is in blocks
# of "blocksize" contiguous rows. With circular = TRUE, a block that starts
# after n - blocksize wraps around to the beginning; otherwise such start
# positions are excluded.
# 7/22/14: Cannot handle d>0 yet.
blockBs <- function(x, blocksize, circular = TRUE, d = 0) {
  x = as.matrix(x)
  n = dim(x)[1]
  indeces = NULL
  if (d != 0) {
    # d-th differences, used by the (unfinished) d > 0 branch below.
    dx = x - x[1, ]
    for (i in 1:d) {
      dx = as.matrix(apply(dx, 2, diff))
    }
  }
  # Draw block start indices until enough rows have been collected.
  if (!circular) {
    while (length(indeces) < n) {
      startInd = sample(1:(n - blocksize + 1 - d), 1)
      indeces = c(indeces, startInd:(startInd + blocksize - 1))
    }
  } else {
    while (length(indeces) < (n - d)) {
      startInd = sample(1:(n - d), 1)
      v = startInd:(startInd + blocksize - 1)
      if ((startInd + blocksize - 1) > (n - d)) {
        # Wrap indices larger than (n - d) around to the beginning.
        indeces = c(indeces, ((v - 1) %% (n - d)) + 1)
      } else {
        indeces = c(indeces, v)
      }
    }
  }
  if (d != 0) {
    # NOTE(review): marked unfinished above; the reconstruction from
    # differences looks suspect (ixstar is recomputed from x each pass) —
    # behavior preserved as-is pending confirmation.
    print(dim(dx))
    print(length(indeces[1:(n - d)]))
    dxstar = as.matrix(dx[indeces[1:(n - d)], ])
    ixstar = dxstar
    for (i in 1:d) {
      ixstar = x + dxstar
      xstar = x[1, ] + rbind(0, ixstar)
    }
  } else {
    xstar = x[indeces[1:(n - d)], ]
  }
  xstar
}

# Empirical SNR for each column of x (or a scalar for a single series).
getSnrEmpir <- function(x, f) {
  if (!is.matrix(x)) { # was class(x) != "matrix": unreliable in R >= 4.0
    out = getSnrEmpirSub(x, f)
  } else {
    out = apply(x, 2, getSnrEmpirSub, f)
  }
  out
}

# Get the empirical snr based on filter size f and time series input x.
# Outputs a scalar: var(smooth signal) / var(residual noise).
getSnrEmpirSub <- function(x, f, plotit = FALSE, ...) {
  signal = filter2(x, f)
  noise = x - signal
  snr = var(signal) / var(noise)
  if (plotit) {
    plot(x, type = "l", ...)
    lines(signal, col = 3, lwd = 2)
  }
  snr
}

# Get the sign of the max of the absolute value of each vector entry:
# returns a vector of zeros with a single +1 (or -1) at the position of the
# entry with the largest magnitude.
vectorAbsMaxSign <- function(v) {
  output = rep(0, length(v))
  maxAbsV = max(abs(v))
  maxV = max(v)
  minV = min(v)
  if (maxAbsV == maxV) {
    output[which.max(v)] = 1
  } else if (maxAbsV == -minV) {
    output[which.min(v)] = -1
  }
  output
}

#' Test number of significant maf time series in data
#'
#' Perform a statistical test for how many signals are hiding in the data.
#' The output gives p-values for each time series
#' @param maf.object The output of the \code{maf} function.
#' @param smooth.span Fraction between 0-1 that specifies the proportion of timesteps
#' to include in the smoothing window, which is weighted by a tricubic. See ?loess for
#' details. Alternatively, one can specify an integer greater than 1 which refers to the
#' number of time points to include in the tricubic filter.
#' @param B Number of replications in the confidence interval calculations
#' @param alpha The significance level
#' @param block.size The block size of the resampled residuals, i.e. the number of
#' contiguous time steps to sample at the time from the set of residuals.
#' @return A list with the following items
#' \describe{
#' \item{\code{statStar}}{ Empirical SNR for each resampled MAF time series, i.e. a p x B matrix
#' where p is the total number of predictors and B is the number of bootstraps/replications}
#' \item{\code{statObs}}{ The empirical SNR of the original MAF time series.}
#' \item{\code{pval}}{The p-values of each MAF, where the p-value refers to the number of
#' resampled MAFs that have a higher empirical SNR than the original MAFs. If there are less
#' than \eqn{\alpha}, e.g.
#' 0.05, resampled MAFs that have a higher empirical SNR than the
#' original MAF, the MAF in question is not significant.}
#' \item{\code{mafStar}}{ The B resampled maf timeseries, in a 3d array.}
#' \item{\code{mafSmooth}}{ The smooth original maf estimates.}
#' }
#' @details
#' The confidence intervals are obtained using a resampling scheme which extracts the residuals of the original
#' time series after smoothing
#' with the same filter as the one parametrized by \code{smooth.span}. The residuals are
#' bootstrapped or block bootstrapped (if there is temporal structure in the residuals)
#' and added back to the smooth, creating a new data set. From the new data set a new set of
#' MAFs are calculated.
#'
#' The smoothing parameter \code{smooth.span} is also used when calculating the empirical
#' Signal-to-Noise ratio. Where we subtract the smooth from the time series and calculate
#' the variance of the smooth estimate over the residual variance. This gives an empirical
#' signal-to-noise estimate.
#'
#' @examples
#' # Extract mafs from dataset
#' maf.object = maf(treeringTimeseries)
#' # Test for how many mafs are present
#' test.Maf(maf.object)
#-------------
# There are two methods:
# method 1: Do subtract the smooth and calculate test statistic
#           p val = number of test stats that are greater than original
# method 2: Do NOT subtract the smooth and calculate test statistic
#' @export
test.Maf <- function(maf.obj, alpha=0.05, block.size=5, smooth.span=30, B=100) {
  mafs = maf.obj$mafs
  n = dim(mafs)[1]
  p = dim(mafs)[2]
  mafStar = array(0, dim = c(B, n, p)) # B resampled maf series
  statStar = matrix(0, p, B)           # empirical SNR per resample
  x = maf.obj$x
  mafSmooth = filter2(mafs, smooth.span)[, 1:p]
  xSmooth = filter2(x, smooth.span)
  residuals = x - xSmooth
  statObs = sort(getSnrEmpir(mafs, smooth.span), decreasing = TRUE)
  for (i in 1:B) {
    residualsStar = blockBs(residuals, block.size)
    # xStar = residualsStar # method 1
    xStar = xSmooth + residualsStar # method 2
    mafStarUnsigned = as.matrix(maf(xStar)$mafs[, 1:p])
    # Match each resampled MAF to an original one by maximal absolute
    # correlation, carrying the sign along.
    corMat = cor(mafStarUnsigned, mafSmooth)
    permutation = t(apply(corMat, 1, vectorAbsMaxSign))
    mafStar[i, , ] = mafStarUnsigned %*% permutation
    statStar[, i] = sort(getSnrEmpir(maf(residualsStar)$mafs[, 1:p], smooth.span),
                         decreasing = TRUE)
  }
  # Calculate p-values: fraction of resamples beating the observed SNR.
  # (Replaces the fragile `j <<- j + 1` global-counter construct.)
  pval = vapply(seq_len(p), function(j) {
    sum(statStar[j, ] > statObs[j]) / B # method 1
  }, numeric(1))
  # was class(statStar) != "matrix": unreliable comparison in R >= 4.0
  if (!is.matrix(statStar)) {
    statStar = matrix(statStar, 1, B)
  }
  if (all(pval < alpha)) {
    nSignificantMafs = p
  } else {
    nSignificantMafs = min(which(pval > alpha)) - 1 # method 1
    # nSignificantMafs = min(which(pval > alpha)) # method 2
  }
  print(paste("Estimate of # of MAFs in dataset: ", nSignificantMafs))
  output = list(pval = pval, statStar = statStar, statObs = statObs,
                mafStar = mafStar, mafSmooth = mafSmooth)
  # Matches original behavior: the result was returned invisibly
  # (last expression was an assignment).
  invisible(output)
}

#' Plot maf object
#'
#' This function will plot the first few mafs in the dataset overlaid
#' with a smooth version of the same maf and, if desired, confidence intervals. The
#' legend shows the empirical signal-to-noise ratio, i.e. signal variance (across time)
#' divided by noise variance.
#' Additionally, an estimate of the number of mafs will be printed.
#' @param maf.object The output of the \code{maf} function.
#' @param smooth.span Fraction between 0-1 that specifies the proportion of timesteps
#' to include in the smoothing window, which is weighted by a tricubic. See ?loess for
#' details. Alternatively, one can specify an integer greater than 1 which refers to the
#' number of time points to include in the tricubic filter.
#' @param cexVal The size of the labels in the plot
#' @param which.maf The indices of the MAFs to plot
#' @param with.uncertainty A logical specifying whether to plot the confidence interval
#' around each maf.
#' @param B Number of replications in the confidence interval calculations
#' @param alpha The significance level
#' @param block.size The block size of the resampled residuals, i.e. the number of
#' contiguous time steps to sample at the time from the set of residuals.
#' @return A plot of the MAFs and a list containing
#' \describe{
#' \item{\code{statStar}}{ Empirical SNR for each resampled MAF time series, i.e. a p x B matrix
#' where p is the total number of predictors and B is the number of bootstraps/replications}
#' \item{\code{statObs}}{ The empirical SNR of the original MAF time series.}
#' \item{\code{pval}}{The p-values of each MAF, where the p-value refers to the number of
#' resampled MAFs that have a higher empirical SNR than the original MAFs. If there are less
#' than \eqn{\alpha}, e.g. 0.05, resampled MAFs that have a higher empirical SNR than the
#' original MAF, the MAF in question is not significant.}
#' }
#' @details
#' The confidence intervals are obtained using a resampling scheme which extracts the residuals of the original
#' time series after smoothing
#' with the same filter as the one parametrized by \code{smooth.span}. The residuals are
#' bootstrapped or block bootstrapped (if there is temporal structure in the residuals)
#' and added back to the smooth, creating a new data set. From the new data set a new set of
#' MAFs are calculated.
#'
#' The smoothing parameter \code{smooth.span} is also used when calculating the empirical
#' Signal-to-Noise ratio. Where we subtract the smooth from the time series and calculate
#' the variance of the smooth estimate over the residual variance. This gives an empirical
#' signal-to-noise estimate.
#'
#' @examples
#' # Extract mafs from dataset
#' maf.object = maf(treeringTimeseries)
#' # Plot the first mafs with uncertainty estimates and estimate number of
#' # significant mafs contained in the dataset
#' plot(maf.object)
#-------------
#' @export
plot.Maf <- function(maf.obj, smooth.span=30, cexVal=1.5,
                     which.maf=c(1:min(dim(maf.obj$x)[2], 3)),
                     with.uncertainty=TRUE, B=100, alpha=0.05,
                     block.size=5) {

  mafs = maf.obj$mafs
  p = dim(mafs)[2]
  if (with.uncertainty) {
    output = test.Maf(maf.obj, alpha, block.size, smooth.span, B)
    mafSmooth = output$mafSmooth
  } else {
    output = NULL
    mafSmooth = filter2(mafs, smooth.span)[, 1:p]
  }

  snr = round(getSnrEmpir(mafs, smooth.span), digits = 3)

  # Plot layout: up to 3 panels per row.
  nmaf = length(which.maf)
  if (nmaf < 3) {
    par(mfrow = c(1, nmaf))
  } else {
    par(mfrow = c(ceiling(nmaf / 3), 3))
  }
  # NOTE(review): par() settings are not restored on exit, matching the
  # original behavior.
  for (i in which.maf) {
    par(mar = c(4, 4, 4, 2))
    plot(mafs[, i], cex.axis = cexVal, cex.main = cexVal, main = paste("MAF", i),
         cex.lab = cexVal, ylim = range(c(mafs[, which.maf])), col = gray(0.5),
         type = "l", ylab = "", xlab = "Time step")
    lines(mafSmooth[, i], lwd = 3, col = 1)
    if (with.uncertainty) {
      # Pointwise (1 - alpha) bootstrap band from the resampled MAFs.
      mafStar = output$mafStar
      upper = apply(mafStar[, , i], 2, quantile, 1 - alpha / 2)
      lower = apply(mafStar[, , i], 2, quantile, alpha / 2)
      lines(upper, lty = 2, lwd = 1)
      lines(lower, lty = 2, lwd = 1)
    }
    legend('topleft', c(paste("SNR:", snr[i])), lwd = 2)
  }
  # Bug fix: the documented return value (the test.Maf results) was never
  # returned (the last expression was the for loop). Return it invisibly so
  # the plotting output is unchanged.
  invisible(output)
}

#' High dimensional MAF extraction
#'
#' This is a generalization of the MAF algorithm when the sample size (length of time series)
#' is smaller than the number of time series, i.e. n < p. The idea is to first partition the set of time series
#' into smaller groups such that the n > p and the problem is well-posed with a non-singular covariance
#' matrix. From each group we extract the first MAF and then
#' @param mat A matrix of dimension n x p. Tested only for dimensions smaller than n/p~10000.
#' @param alpha Tuning parameter \in [0, 1] that describes the size of the partitioning groups. Viz,
#' group size <= n * alpha. We find for the cases we tried that the results are not that dependant
#' on the value of \alpha as long as it's in the range of 0.2 to 0.8.
#' @param type What type of partition to use (default is 'random'). We find that a random partition yields
#' the best results. There are theoretical justifcations for this in an upcoming paper.
#'
#' @return A list containing the following items
#' \describe{
#' \item{\code{out}}{MAF1 time series}
#' \item{\code{a}}{The set of weights which when multiplied by the original time series produces MAF1.}
#' }
#' @details
#' Currently only implemented for the first MAF, for higher MAFs either stay tuned or subtract out MAF1 from the data set,
#' e.g. by linear regression, and then run the algorithm on the residuals.
#'
#' @examples
#' # Make a signal
#' n = 150 # number of time steps
#' p = 600 # number of time series
#' rho = 0.4 # noise cross correlation
#' cov_e = matrix(rho, p, p) # noise covariance
#' diag(cov_e) = 1
#' noise = mvrnorm(n=n, mu=rep(0, p), Sigma=cov_e)
#' x=seq(0,4,len=n);
#' signal = (x-1)*(x-2)*(x-3)
#' signal.sd = c(rep(0.2, p/2), rep(0.6, p/2))
#' signals = signal %*% t(signal.sd) / sd(signal)
#' data = signals + noise
#' result = MAFpart(data)
#' plot(result$out, type="l", xlab='Time steps', ylab='Signal (Unitless)',
#' 	main='High Dimensional Signal Extraction')
#' lines(lm(result$out ~ signal)$fit, lwd=2, col=2)
#' legend('topleft', c('Estimate', 'Truth'), col=c(1,2), lwd=c(1,2))
#-------------
#' @export
MAFpart <- function(mat, alpha=0.2, type="random") {
  p = dim(mat)[2]
  n = dim(mat)[1]
  if (is.vector(mat)) {
    # A single series is its own "MAF1" with unit weight.
    out = mat
    a = 1
  } else if (p <= (n * alpha)) {
    # Base case: the group is small enough for a direct MAF decomposition.
    # NOTE(review): `MAF` is not defined in this file and its $out/$a fields
    # do not match maf()'s return value — confirm where MAF() comes from.
    base = MAF(mat)
    out = as.matrix(base$out[, 1], n, 1)
    a = base$a[, 1]
  } else {
    # Partition the p series into m groups of size <= alpha * n.
    m = ceiling(p / (alpha * n)) # bug fix: ceil() is not a base R function
    if (type == "random") {
      v = rep(1:m, ceiling(p / m)) # bug fix: ceil() -> ceiling()
      ind = v[sample(v, p)] # shuffle randomly!
    } else if (type == "kmeans") {
      ind = kmeans(cov(mat), m)$cluster # was partial match $clu
    } else if (type == "kmediod") {
      ind = pam(cov(mat), m, cluster.only = TRUE)
    } else {
      print("not a valid type")
    }
    myList = list(); length(myList) = m
    out = list(); length(out) = m # was length(myList), a copy-paste slip
    v = list(); length(v) = m     # was length(myList), a copy-paste slip
    # First MAF from each group, then recurse on the collected MAF1s.
    for (i in 1:m) {
      myList[[i]] = mat[, ind == i]
      group.maf = MAFpart(myList[[i]], type = type)
      out[[i]] = group.maf$out
      v[[i]] = group.maf$a
    }
    new_mat = do.call(cbind, out)
    mafrec = MAFpart(new_mat, type = type)
    out = mafrec$out
    w = mafrec$a # bug fix: source had a stray backslash (`mafrec\$a`)
    # Combine group weights into one full-length, unit-norm weight vector.
    a = rep(0, p)
    for (i in 1:m) {
      a[ind == i] = v[[i]] * w[i]
    }
    a = a / sqrt(sum(a^2))
  }
  list(out = out, a = a)
}

# matzhaugen/maf documentation built on July 13, 2017, 2:18 p.m.