R/pqc_main.R In obleeker/quant.pca: Principal Quantile Components Analysis

```# Helper Functions --------------------------------------------------------

.weight.calc = function(X,tau,init=FALSE){
# helper function to either calculate weights based on Reconstruction OR init randomly (based on dim of the data matrix X)

# get dimensions of argument
n = nrow(X)
p = ncol(X)

if(init) { # if used for random init of weights
weights = matrix(runif(n=n*p),nrow=n,ncol=p) # output random weigths at desired levels
} else {
weights = ifelse(X>=0,tau,1-tau) # calculate weights of argument
}
return(weights) # returns weigths as matrix of same size
}

.check.loss = function(X, tau) { # calculates scaled quantile check loss (used in cross validation)
return(sum(.weight.calc(X,tau)*abs(X))/length(X))
}

# Internal minimization function -----------------------------------------

.pqc.optim = function(X, Z, basis, tau, lambda = 1, const=FALSE) { # This is the function containing the Quadratic Programming
not.na = !is.na(X) # X is a vector, so check how many entries are not empty
X = X[not.na] # remove NA entries from vector
Z = Z[not.na] # same for the weight vector

n = length(X) # remaining number of observations in data column/row

k = ncol(basis) # target rank, taken from the basis

if (!is.null(k)) { # excpetion for k=1 (then basis is not a matrix)
basis = matrix(basis[not.na,],ncol=k) # make sure that basis is a matrix
} else { # if not then assign k=1 and convert basis to vector
k = 1
basis = matrix(basis[not.na],ncol=1)
}

s.mat = matrix(0, nrow=n,ncol=2*n) # this forms the last columns of the constraint matrix (called B in document)

for (i in 1:n) { # adds indicators for every positive negative residual (could have been done by joining indicators)
s.mat[i,i] = 1
s.mat[i,i+n] = -1
}
A = cbind(basis/sqrt(matrix(Z, nrow=n,ncol=k,byrow=FALSE)), s.mat) # first columns of constraint matrix, basis divided by weights

u.mat = diag(x = lambda,nrow = k,ncol=k) # regularization factors corresponding to the v or u to be estimated

H = matrix(0, nrow = k+2*n,ncol=k+2*n) # matrix in the objective function
H[1:k,1:k] = u.mat # regulization factors
H[(k+1):(n+k),(k+1):(n+k)] = tau^2*diag(ncol=n,nrow=n)*2 # factors for positive residuals
H[(n+k+1):(2*n+k),(n+k+1):(2*n+k)] = (1-tau)^2*diag(ncol=n,nrow=n)*2 # factors for negative residuals
b = as.matrix(X/sqrt(Z))

d = rep(0,k+2*n) # make sure no order 1 part in objective function

alpha = quadprog::solve.QP(Dmat = H,dvec = d,Amat = t(A),bvec = b,meq=n) # call quadprog, meq makes sure everything is handled as equality
out = alpha\$solution[1:k] # we only need the first k results
return(out)
}

#' @importFrom foreach foreach %dopar%
.pqc.next = function (X, Z, basis, tau, lambda, doPar, muEst) {
n = ncol(X) # get data matrix dimensions
p = nrow(X)
k = ncol(basis) # get desired rank

if (muEst) { # if mu is to be estimated (only for calc of U)
X = X - basis[,1] # substract mu
k = k-1 # reduce the rank in the function
basis = basis[,-1] # remove mu from basis
}

out = matrix(NA, nrow = p, ncol = k)# initialize output matrix

if (doPar) { # if parallel is activated us this
out = foreach(i = 1:p, .combine = 'rbind') %dopar% {   # this will calculate every row of U/V at once
.pqc.optim(X[i,, drop=FALSE], Z[i,,drop=FALSE], basis, tau, lambda) # call to quadratic programming wrapper (above)
}
} else { # if not parallel
for(i in 1:p) { # loop for every row in U/V
out[i,] = .pqc.optim(X[i,, drop=FALSE], Z[i,,drop=FALSE], basis, tau, lambda) # call to quadratic programming wrapper (above)
}
}

if (muEst) { # now adjust U in case mu was estimated (add all-one column)
out = cbind(rep(1,p), out)
}
return(out)
}

# Main PQC algorithm -----------------------------------------------------

#' Principal Quantile Components
#'
#' @description Calculate principal quantile components (PQC) via smooth approximation of the asymmetric L1-Norm.
#'
#' @param data data matrix with rows as data entries and columns as variables
#' @param projDim no. of desired principal components
#' @param tau regulization parameter
#' @param lambda regulization parameter
#' @param muEst calculate a constant mu
#' @param epsilon approxiamtion parameter
#' @param iterTol no. of max iterataions
#' @param convTol set algorithm to stop if weigths did not change for convTol no. of consecutive iterations (deactivated for tau=0.5)
#' @param preOut continues based on previous output (if doSeq is TRUE, the components of a previos PCA method are sufficient)
#' @param progBar optional progressbar
#' @param doPar use parallel backend (*nix systems recommended),
#' @param doSeq run via sequential optimization
#' @param randomSeed select a random seed as in set.seed for fixed initialization
#'
#'
#' @examples
#' # generating data
#' n = 100
#' X = data.frame(cbind(rnorm(n),rnorm(n),rnorm(n)))
#' # running the main method
#' pqcomp(data = X, projDim = 2, tau =  0.9, lambda = 0.1, muEst = T)
#'
#' @export
pqcomp = function(data, # data matrix with rows as data entries and columns as variables
projDim, # no. of desired principal components
tau, # asymmetry parameter (b'w 0 and 1)
lambda, # regulization parameter
muEst = TRUE,# calculate a constant
epsilon = 1/log10(nrow(data)), # approxiamtion parameter
iterTol = 200, # no. of max iterataions
# Optionals
convTol = NA, # set algorithm to stop if weigths did not change for convTol no. of consecutive iterations (deactivated for tau=0.5)
preOut = NA, # continues based on previous output (if doSeq is TRUE, the components of a previos PCA method are sufficient)
progBar = TRUE, # optional progressbar
doPar = FALSE, # use parallel backend (*nix systems recommended),
doSeq = FALSE, # run via sequential optimization
randomSeed = NA # select a random seed as in set.seed for fixed initialization
)
{
## this is entirely optional but will show a progress bar, loss value, and weight convergence indicator while the algorithm runs
if (progBar) {
pb = progress::progress_bar\$new( # creates template for progress bar
format = "  estimating tau: :tau [:bar] :percent loss: :approx :wconv",
total = iterTol, clear = FALSE, width= 70)
pb\$tick(0)
}
if (doPar) { # installs required parallel packages
cores = parallel::detectCores() - 1 # sets no. of cores to number of available cores (minus 1)
cl = parallel::makeForkCluster(cores)
doParallel::registerDoParallel(cl)
}

if(!is.na(convTol) && tau==0.5) { # exception not to allow early stopping if tau=0.5
warning("Cannot use timeout if tau=0.5, setting convTol=NA")
convTol=NA
}

# get data dimensions and rename rank
n = nrow(data)
p = ncol(data)
k = projDim

loss = c() # init container for loss values

conv = 0 # convergence break counter if convTol!=NA

X = t(data) # transposes data matrix
na = is.na(X) # save indeces of NA values

# Init U and V with random values
if (!is.na(randomSeed)) { # exception if seed was given
set.seed(randomSeed)
}
if (!is.na(preOut)) { # check for previous output
V = preOut\$components # take components from previous output
Z = matrix(1,nrow=p,ncol=n) # initialize weigths at 1 (no weight)
if (!doSeq) { # sequential algo only requires a base
Z = preOut\$weights # take weights from previous output
}
} else { # regular case: random initialization
Z = matrix(1,nrow=p,ncol=n) # init weights
Z[na] = NA # also set NA values were X are NA (avoid division by NA later)
U = matrix(rnorm(n*k, mean=0, sd=1), nrow=n, ncol=k) # init U randomly
V = matrix(rnorm(p*k, mean=0, sd=1), nrow=p,ncol=k) # init V randomly

if (muEst) { # if mu is estimated, do an initial calculation of mu
mu = .pqc.next(X, Z, basis = matrix(1, ncol=1, nrow=n), tau=tau, lambda = lambda, doPar=doPar, muEst=FALSE) # this will calculate baseline mu

V = cbind(mu, V) # adjoin V with mu
U = cbind(rep(1,n),U) # adjoin U with all-one column
}
}

R.old = matrix(0, nrow=p, ncol=n) # init Residiual matrix for previous iteration

# Set-up loop requirements
continue=TRUE
iter = 0

while (continue) { # loop until continue=F
U.next = .pqc.next(X=t(X), Z=t(Z), basis = V, tau = tau, lambda = lambda, doPar=doPar, muEst = muEst) # here muEst tells the minimization wrapper to demean X and add the all-on column back later
if (doSeq) {
}
V.next = .pqc.next(X=X, Z=Z, basis = U, tau = tau, lambda = lambda, doPar=doPar, muEst = FALSE) # calculate the components matrix (here muEst says not to demean X)

# Calculate residuals
R = X - V.next%*%t(U.next)

# Calculate the quantile check loss
diff = sum(abs(.weight.calc(R[!na],tau)*R[!na]))

# Check wether the weights from the previous iteration are the same
weight.conv = all(.weight.calc(R[!na],tau)==(.weight.calc(R.old[!na],tau)))

# Save check loss for later information
loss = append(loss,diff)

# Calculate weigths for the next iteration
Z.next = sqrt((.weight.calc(R,tau)^2)*(R^2)+epsilon^2)

# Prepare for next iteration, rename all matrices
R.old = R
V = V.next
U = U.next
Z = Z.next

iter = iter+1 # increase iteration number

if (iter>iterTol) { # break if max iterations are reached
continue=FALSE
} else { # if not
if (progBar) { # show progress if progBar is activated
pb\$tick(tokens = list(tau = tau, approx = floor(diff), wconv = ifelse(weight.conv==TRUE, "[c]", "[.]")))
} else{ # print simple indicator that iteration has ended
cat(".",file=stderr())
}
}
if (weight.conv==TRUE) { # additional exception if algo is set to timeout when weights converged
if (!is.na(convTol)) { # increase a counter everytime the weights are the same
conv = conv + 1
continue = !(conv>convTol) # set algo to break if weights stayed the same for as long as specified
}
} else {
conv = 0 # reset counter everytime weights change
}
}
print("Finished!")
if (doPar) { # parallel needs to stop the processes
parallel::stopCluster(cl)
}

# Finally name the components matrix accordingly
if (muEst) {
colnames = c("constant")
} else {
colnames = c()
}
for(i in 1:k) {colnames = append(colnames,paste("PC",i,sep=""))}

colnames(V) = colnames
rownames(V) = colnames(data)

# Output as a list containing some additional information
}

# Cross Validation --------------------------------------------------------

.createNA <- function (x, shareNA) { # routine to create random NA values in a dataset according to % size
n = nrow(x) # dimensions
p = ncol(x)
NAloc = rep(FALSE, n * p) # init indeces as all not NA
NAloc[sample.int(n * p, floor(n * p * shareNA))] = TRUE # randomly add TRUE for holdout
x[matrix(NAloc, nrow = n, ncol = p)] = NA # set data to NA at the TRUE locations
return(x)
}

#' Cross Validation for PQC
#'
#' @description Wrapper for pqcomp(): Select ranges for rank and lambda.
#' Results are calculated on a random hold out on the supplied dataset.
#'
#' @param data data matrix with rows as data entries and columns as variables
#' @param projDimRange # range of desired principal components
#' @param tau asymmetry parameter (b'w 0 and 1)
#' @param lambdaRange regulization parameter
#' @param muEst calculate a constant mu
#' @param epsilon approxiamtion parameter
#' @param shareNA size of holdout (0<shareNA<1)
#' @param iterTol no. of max iterataions
#' @param convTol set algorithm to stop if weigths did not change for convTol no. of consecutive iterations (deactivated for tau=0.5)
#' @param preOut continues based on previous output (if doSeq is TRUE, the components of a previos PCA method are sufficient)
#' @param progBar optional progressbar
#' @param doPar use parallel backend (*nix systems recommended),
#' @param doSeq run via sequential optimization
#'
#' @return list containing pqcomp() results
#' @references Madeleine Udell et al. (2016), "Generalized Low Rank Models".
#' @export
cv.pqc = function(data,
projDimRange,
tau,
lambdaRange,
muEst = TRUE,
epsilon = 1/log10(nrow(data)),
shareNA = 0.1,
iterTol = 200,
convTol = NA,
progBar = TRUE,
doPar = TRUE,
doSeq = FALSE) {
num = length(lambdaRange)*length(projDimRange) # how many parameter combinations
Y.na = .createNA(data, shareNA) # create NA values in data
na.loc = is.na(Y.na) # save location of true parameters
Y.true = data[na.loc] # save true values

results = list() # init result list

i = 1 # init counter
for (lambda in lambdaRange) { # loop for every lambda
for (k in projDimRange) { # loop for every projDim
print(paste(i,num,sep="/")) # just print at which iteration we are

# regular PQC call with parameter combination
pqc = pqcomp(Y.na, projDim = k,tau = tau,lambda = lambda, muEst=muEst, epsilon = epsilon, iterTol = iterTol, convTol = convTol, progBar = progBar, doPar = doPar,  doSeq=doSeq)

result = list(lambda = lambda, k=k, na.pred = na.pred, na.true = Y.true,loss=pqc\$loss[length(pqc\$loss)], converged = pqc\$converged) # create list of result infos
results = append(results, list(result)) # adjoin to results
i = i+1 # increment counter
}
}
return(results)
}

#' Cross Validation Scores
#'
#' @description Calculate cross validation scores created by cv.pqc() via quantile check loss.
#'
#' @param cv.result list output of cv.pqc()
#' @param tau asymmetry parameter (0<tau<1)
#'
#' @return list containing pqcomp() results
#'
#' @export
cv.scores = function(cv.result, tau) { # This will calculate the CV score using the object created by cv.pqc
scores = lapply(cv.result,function(x) data.frame(k=x\$k, lambda = x\$lambda, loss = x\$loss, cv.score = .check.loss(x\$na.true-x\$na.pred,tau))) # calculate the CV score based on true and predicted values for every result
score.df = do.call("rbind", scores) # put scores into dataframe
score.df["lambda"] = factor(score.df\$lambda) # put lambda in the column into factors
score.df["cv.score.resc"] = rescale(score.df\$cv.score, to = c(0,1)) # add a rescaled score for comparison
return(score.df)
}
```
obleeker/quant.pca documentation built on July 7, 2019, 12:41 a.m.