################################################################################
#
# iprior: Linear Regression using I-priors
# Copyright (C) 2018 Haziq Jamil
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
# The main difficulty with estimating I-prior models is the number of
# hyperparameters to estimate depend on which kernels are used. In kernL2(),
# kernels are specified by the user, which are then applied to the data x. From
# this, we create a "param" table which specifies the kernels used for each
# variable. The param table must be converted to the "theta" vector which is the
# vector of parameters to estimate. This file contains helper functions to
# convert between kernels, param, and theta. In particular, we have: 1.
# param_to_kernel(); 2. theta_to_kernel(); 3. param_to_theta(); 4.
# kernel_to_param(); 5. theta_to_param(); 6. theta_to_collapsed_param(); 7.
# theta_to_psi().
#
# We then also have several other helper functions, in particular
# expand_Hl_and_lambda() and get_Hl() which are also used in predict().
param_to_kernel <- function(param) {
# Convert param table to vector of kernels.
#
# Args: param table.
#
# Returns: Vector of kernels.
as.character(param$kernels)
}
theta_to_kernel <- function(theta, object) {
# Convert theta to vector of kernels.
#
# Args: theta and an ipriorKernel object.
#
# Returns: Vector of kernels.
param <- theta_to_param(mod$theta, mod$kernL)
param_to_kernel(param)
}
param_to_theta <- function(param, est.list, logpsi = 0) {
# Convert param table to the theta vector. Note that theta is designed so that
# the values are unbounded, i.e. hurst is Phi^{-1}(hurst), lengthscale is
# log(lengthscale), etc. theta_to_param() reverses this for final
# presentation.
#
# Args: A param table, the list of parameters to be estimated and optional
# logpsi value.
#
# Returns: theta is a vector of parameters to be passed to optim or EM for
# optimisation, including the logpsi value.
param <- param[, seq_len(4)] # lambda, hurst, lengthscale, offset
param$hurst <- qnorm(param$hurst)
param$lengthscale <- log(param$lengthscale)
param$offset <- log(param$offset)
if (nrow(param) == 1) param$lambda <- log(param$lambda)
tmp <- collapse_param(param)
theta.full <- c(tmp$param, psi = logpsi)
param.na <- tmp$na
tmp <- reduce_theta(theta.full, est.list)
theta.reduced <- tmp$theta.reduced
theta.drop <- tmp$theta.drop
theta.omitted <- tmp$theta.omitted
list(theta = theta.reduced, param.na = param.na, theta.drop = theta.drop,
theta.omitted = theta.omitted)
}
kernel_to_param <- function(kernels, lambda) {
# Convert vector of kernels to a param table.
#
# Args: kernels is a p-vector of kernels to apply on the data. lambda are the
# scale parameters.
#
# Returns: The param table.
param <- as.data.frame(matrix(NA, ncol = 5, nrow = length(kernels)))
names(param) <- c("lambda", "hurst", "lengthscale", "offset", "degree")
param$lambda <- lambda
kernel.match <- c("fbm", "se", "poly", "poly")
for (i in seq_along(kernel.match)) {
res <- rep(NA, length(kernels))
where_kernel <- grepl(kernel.match[i], kernels)
for (j in seq_len(sum(where_kernel))) {
res[where_kernel][j] <- get_hyperparam(kernels[where_kernel][j])
if (i == 4) res[where_kernel][j] <- get_polydegree(kernels[where_kernel][j])
}
param[, i + 1] <- res
}
res <- cbind(param, kernels)
res$kernels <- as.character(res$kernels)
res
}
reduce_theta <- function(theta.full, est.list) {
# The user may specify for some of the hyperparameters to not be estimated,
# therefore theta vector should be smaller than the full set of
# hyperparameters. This function trims theta correctly.
#
# Args: theta.full is the original and full theta vector. est.list is
# generated in kernL2() which is a list telling us which parameters are
# estimated and which are not.
#
# Returns: The trimmed theta vector.
theta.full.orig <- theta.full
# Estimate Hurst coefficient? ------------------------------------------------
est.lambda <- est.list$est.lambda
ind.lambda <- grepl("lambda", names(theta.full))
theta.full[ind.lambda][!est.lambda] <- NA
# Estimate Hurst coefficient? ------------------------------------------------
est.hurst <- est.list$est.hurst
ind.hurst <- grepl("hurst", names(theta.full))
theta.full[ind.hurst][!est.hurst] <- NA
# Estimate lengthscale in SE kernel? -----------------------------------------
est.l <- est.list$est.lengthscale
ind.l <- grepl("lengthscale", names(theta.full))
theta.full[ind.l][!est.l] <- NA
# Estimate offset in polynomial kernel? --------------------------------------
est.c <- est.list$est.offset
ind.c <- grepl("offset", names(theta.full))
theta.full[ind.c][!est.c] <- NA
# Estimate error precision psi? ----------------------------------------------
est.psi <- est.list$est.psi
ind.psi <- grepl("psi", names(theta.full))
theta.full[ind.psi][!est.psi] <- NA
theta.drop <- is.na(theta.full)
theta.reduced <- theta.full[!theta.drop]
theta.omitted <- theta.full.orig[theta.drop]
list(theta.reduced = theta.reduced, theta.omitted = theta.omitted,
theta.drop = theta.drop)
}
expand_theta <- function(theta.reduced, theta.drop, theta.omitted) {
# This convertes the reduced theta vector back to the full theta vector.
# Useful in other functions such as theta_to_param().
#
# Args: theta.reduced is obvious. theta.drop is information on which of the
# full theta was not estimated. theta.omitted contains the values for which
# theta component that was not estimated. These are all generated by
# param_to_theta().
#
# Returns: The full theta.
theta.full <- theta.drop
theta.full[!theta.drop] <- theta.reduced
theta.full[theta.drop] <- theta.omitted
theta.full
}
collapse_param <- function(param) {
# Args: A param table.
#
# Returns: A vectorised form of param with the na values removed. The param.na
# values are output here too.
#
# Notes: Used as a helper function in param_to_theta(), and also useful for
# final presentation of the parameters as this function names the parameters
# too.
res <- na.omit(unlist(param[, 1:4]))
param.names <- names(res)
na <- as.numeric(na.action(res))
res <- as.numeric(res)
param.digits <- gsub("[^[:digit:]]", "", param.names)
param.names <- gsub("[[:digit:]]", "", param.names)
param.names <- paste0(param.names, "[", param.digits, "]")
param.names <- gsub("[[]]", "", param.names)
names(res) <- param.names
list(param = res, na = na)
}
theta_to_param <- function(theta, object) {
# Args: A vector of parameters to be optimised, including logpsi. object must
# be either a ipriorKernel type object, or a list containing param.na,
# which.pearson and poly.deg.
#
# Returns: A param table.
#
# Notes: The logpsi value is removed. To obtain this use theta_to_psi(). If
# object is specified, then the param.na, which.pearson and poly.deg are
# obtained from object.
param.na <- object$thetal$param.na
which.pearson <- object$which.pearson
poly.deg <- object$poly.deg
theta <- expand_theta(theta, object$thetal$theta.drop,
object$thetal$theta.omitted)
theta <- theta[-length(theta)]
full.length <- length(c(theta, param.na))
param <- matrix(NA, ncol = 4, nrow = full.length / 4)
tmp <- c(param)
tmp[-param.na] <- theta
param[] <- tmp
param <- cbind(param, degree = poly.deg)
param <- as.data.frame(param)
names(param) <- c("lambda", "hurst", "lengthscale", "offset", "degree")
param$hurst <- pnorm(param$hurst)
param$lengthscale <- exp(param$lengthscale)
param$offset <- exp(param$offset)
if (nrow(param) == 1) param$lambda <- exp(param$lambda)
param$kernels <- correct_pearson_kernel(
apply(param, 1, param_translator), which.pearson
)
param
}
param_translator <- function(x) {
# Helper function in theta_to_param().
#
# Args: Row vector from param table.
#
# Returns: The kernel used.
hyperparam <- x[-1]
if (!is.na(hyperparam[1]))
return(paste0("fbm,", hyperparam[1]))
if (!is.na(hyperparam[2]))
return(paste0("se,", hyperparam[2]))
if (!is.na(hyperparam[3]))
return(paste0("poly", hyperparam[4], ",", hyperparam[3]))
"linear"
}
correct_pearson_kernel <- function(x, which.pearson) {
# When using theta_to_param(), unable to identify which data x uses the
# Pearson kernel. This helper function corrects it by reading from the logical
# which.pearson vector.
#
# Args: The kernel vector and which.pearson (logical), indicating which of the
# x position uses the Pearson kernel.
#
# Returns: The corrected kernel vector.
x[which.pearson] <- "pearson"
x
}
kernel_translator <- function(x, y = NULL, kernel, lam.poly = 1) {
# Used as a helper function in get_Hl() to output list of kernel
# matrices in kernL2() and predict(). For future expansion, add new kernels
# here.
#
# Args: x, y (optional) data and kernel a character vector indicating which
# kernel to apply x and y on. lam.poly is the scale for polynomial kernels.
#
# Returns: A kernel matrix.
if (grepl("linear", kernel)) return(kern_linear(x, y))
if (grepl("canonical", kernel)) return(kern_linear(x, y))
if (grepl("fbm", kernel)) {
if (grepl(",", kernel)) {
hurst <- get_hyperparam(kernel)
return(kern_fbm(x, y, gamma = hurst))
} else {
return(kern_fbm(x, y))
}
}
if (grepl("se", kernel)) {
if (grepl(",", kernel)) {
lengthscale <- get_hyperparam(kernel)
return(kern_se(x, y, l = lengthscale))
} else {
return(kern_se(x, y))
}
}
if (grepl("poly", kernel)) {
if (grepl(",", kernel)) {
offset <- get_hyperparam(kernel)
} else {
offset <- 0
}
degree <- get_polydegree(kernel)
return(kern_poly(x, y, c = offset, d = degree, lam.poly = lam.poly))
}
if (grepl("pearson", kernel)) return(kern_pearson(x, y))
stop("Incorrect kernel specification or unsupported kernel.",
call. = FALSE)
}
theta_to_collapsed_param <- function(theta, object) {
# This is a wrapper function for theta_to_param(). It is useful to get
# the vector of parameters directly from theta.
#
# Args: theta (usually theta.full) and the ipriorKernel object.
#
# Returns: A vector of parameters.
param <- theta_to_param(theta, object)
c(collapse_param(param)$param, theta_to_psi(theta, object))
}
theta_to_psi <- function(theta, object) {
# Obtains psi from the theta vector, or if not estimated, from the
# ipriorKernel object.
#
# Args: A vector of parameters to be optimised, including logpsi, and an
# ipriorKernel object.
#
# Returns: psi, the error precision.
theta <- expand_theta(theta, object$thetal$theta.drop,
object$thetal$theta.omitted)
logpsi <- theta[length(theta)]
exp(logpsi)
}
get_hyperparam <- function(x) {
# Obtain the hyperparameters of kernels. Note: the exported version is
# get_hyp().
#
# Args: x is either a character vector of the kernel specification (e.g. x <-
# "fbm,0.5"), or it is the output from one of the kern_x() functions.
#
# Returns: The hyperparameter associated with the kernel (value that comes
# after the comma).
if (!is.null(attributes(x)$kernel)) x <- attributes(x)$kernel
as.numeric(unlist(strsplit(x, ","))[2])
}
get_polydegree <- function(x) {
# Obtain the degree of the polynomial kernel. Note: the exported version is
# get_degree().
#
# Args: x is either a character vector of the kernel specification (e.g. x <-
# "fbm,0.5"), or it is the output from one of the kern_x() functions.
#
# Returns: Integer.
if (!is.null(attributes(x)$kernel)) x <- attributes(x)$kernel
degree <- unlist(strsplit(x, ","))[1]
degree <- unlist(strsplit(degree, "poly"))
if (length(degree) == 1) degree <- 2
else degree <- as.numeric(degree[2])
degree
}
get_kernels_from_Hl <- function(x) {
# Obtain the kernels used to generate the list of kernel matrices.
#
# Args: x is a list of kernel matrices.
#
# Returns: A character vector of kernels.
sapply(x, function(x) attributes(x)$kernel)
}
get_Xl.nys <- function(object) {
# When using the Nystrom method, we calculate the kernel matrix using
# get_Hl(Xl, Xl.nys) which produces a smaller m x n matrix. This is the helper
# function to obtain Xl.nys.
#
# Args: An ipriorKernel object with Nystrom option called.
#
# Returns: Xl.nys, a list similar to Xl but only the first m rows are returned
# (after sampling that is).
nys.check <- is.ipriorKernel_nys(object)
if (isTRUE(nys.check)) {
Xl.nys <- lapply(object$Xl, reorder_x,
smp = seq_len(object$nystroml$nys.size))
mostattributes(Xl.nys) <- attributes(object$Xl)
return(Xl.nys)
} else {
stop("Nystrom option not called.", call. = FALSE)
}
}
get_Hl <- function(Xl, yl = list(NULL), kernels, lambda) {
# Obtain the list of kernel matrices. Except for polynomial kernels, these are
# not scaled, i.e. not Hlam matrices.
#
# Args: List of data Xl and yl (optional), vector of same length of kernel
# characters to instruct kernel_translator() which kernels to apply each of
# the x and y. lambda is needed for the polynomial kernels.
#
# Returns: List of kernel matrices.
mapply(kernel_translator, Xl, yl, kernels, lambda, SIMPLIFY = FALSE)
}
expand_Hl_and_lambda <- function(Hl, lambda, intr, intr.3plus, env = NULL) {
# Helper function to expand Hl (list of kernel matrices) and lambda (scale
# parameters) according to any interactions specification.
#
# Args: List of kernel matrices (Hl) and vector of scale parameters (lambda).
# intr and intr.3plus are obtained from kernL2 which specifies the
# interactions. env is the environment in which to assign the results.
#
# Returns: A list of expanded Hl and lambda. If env = NULL, then this list is
# returned. If an environment is specified, it is directly assigned there.
p <- length(Hl)
no.int <- max(0, ncol(intr))
no.int.3plus <- max(0, ncol(intr.3plus))
if (!is.null(intr) & length(intr) > 0) {
for (j in seq_len(no.int)) {
ind1 <- intr[1, j]; ind2 <- intr[2, j]
lambda[p + j] <- lambda[ind1] * lambda[ind2]
Hl[[p + j]] <- Hl[[ind1]] * Hl[[ind2]]
attr(Hl[[p + j]], "kernel") <- paste(attr(Hl[[ind1]], "kernel"),
attr(Hl[[ind2]], "kernel"),
sep = " x ")
}
}
if (!is.null(no.int.3plus) & length(intr.3plus) > 0) {
for (j in seq_len(no.int.3plus)) {
lambda[p + no.int + j] <- Reduce("*", lambda[intr.3plus[,j]])
Hl[[p + no.int + j]] <- Reduce("*", Hl[intr.3plus[, j]])
intr.3plus.kernels <- sapply(Hl[intr.3plus[, j]], attr, "kernel")
attr(Hl[[p + no.int + j]], "kernel") <- paste(intr.3plus.kernels,
collapse = " x ")
}
}
if (is.null(env)) return(list(Hl = Hl, lambda = lambda))
else {
assign("Hl", Hl, envir = env)
assign("lambda", lambda, envir = env)
}
}
BlockB_fn <- function(Hl, intr, n, p) {
# This is the function which returns the BlockBStuff required for closed-form
# EM algorithm.
#
# Args: The list of kernel matrices, (two-way) interaction specifications
# intr, sample size n and number of variables p.
#
# Returns: BlockBStuff - a bunch of precalculated kernel matrices and a
# function BlockB which tells iprior_em_closed() how to manipulate these
# matrices.
# Initialise -----------------------------------------------------------------
Hl <- expand_Hl_and_lambda(Hl, seq_along(Hl), intr, NULL)$Hl
environment(index_fn_B) <- environment()
H2l <- Hsql <- Pl <- Psql <- Sl <- ind <- ind1 <- ind2 <- NULL
BlockB <- function(k, x = lambda) NULL
if (length(Hl) == 1L) {
# CASE: Single lambda ------------------------------------------------------
Pl <- Hl
Psql <- list(fastSquare(Pl[[1]]))
Sl <- list(matrix(0, nrow = n, ncol = n))
BB.msg <- "Single lambda"
} else {
# Next, prepare the indices required for index_fn_B().
z <- seq_along(Hl)
ind1 <- rep(z, times = (length(z) - 1):0)
ind2 <- unlist(lapply(2:length(z), function(x) c(NA, z)[-(0:x)]))
# Prepare the cross-product terms of squared kernel matrices
for (j in seq_along(ind1)) {
H2l.tmp <- Hl[[ind1[j]]] %*% Hl[[ind2[j]]]
H2l[[j]] <- H2l.tmp + t(H2l.tmp)
}
if (!is.null(intr)) {
# CASE: Parsimonious interactions only ---------------------------------
for (k in z) {
Hsql[[k]] <- fastSquare(Hl[[k]])
if (k <= p) ind[[k]] <- index_fn_B(k) # only create indices for non-intr
}
BlockB <- function(k, x = lambda) {
# Calculate Psql instead of directly P %*% P because this way
# is < O(n^3).
indB <- ind[[k]]
lambda.P <- c(1, x[indB$k.int.lam])
Pl[[k]] <<- Reduce("+", mapply("*", Hl[c(k, indB$k.int)], lambda.P,
SIMPLIFY = FALSE))
Psql[[k]] <<- Reduce("+", mapply("*", Hsql[indB$Psq],
c(1, x[indB$Psq.lam] ^ 2),
SIMPLIFY = FALSE))
if (!is.null(indB$P2.lam1)) {
lambda.P2 <- c(rep(1, sum(indB$P2.lam1 == 0)), x[indB$P2.lam1])
lambda.P2 <- lambda.P2 * x[indB$P2.lam2]
Psql[[k]] <<- Psql[[k]] +
Reduce("+", mapply("*", H2l[indB$P2], lambda.P2, SIMPLIFY = FALSE))
}
lambda.PRU <- c(rep(1, sum(indB$PRU.lam1 == 0)), x[indB$PRU.lam1])
lambda.PRU <- lambda.PRU * x[indB$PRU.lam2]
Sl[[k]] <<- Reduce("+", mapply("*", H2l[indB$PRU], lambda.PRU,
SIMPLIFY = FALSE))
}
BB.msg <- "Multiple lambda with parsimonious interactions"
} else {
# CASE: Multiple lambda with no interactions, or with non-parsimonious -
# interactions ---------------------------------------------------------
for (k in seq_along(Hl)) {
Pl[[k]] <- Hl[[k]]
Psql[[k]] <- fastSquare(Pl[[k]])
}
BlockB <- function(k, x = lambda) {
ind <- which(ind1 == k | ind2 == k)
Sl[[k]] <<- Reduce("+", mapply("*", H2l[ind], x[-k], SIMPLIFY = FALSE))
}
BB.msg <- "Multiple lambda with no interactions"
}
}
list(H2l = H2l, Hsql = Hsql, Pl = Pl, Psql = Psql, Sl = Sl, ind1 = ind1,
ind2 = ind2, ind = ind, BlockB = BlockB, BB.msg = BB.msg)
}
index_fn_B <- function(k) {
# Indexer helper function used to create indices for H2l. This is a helper
# function for BlockB_fn()
#
# Args: k is the index (out of p scale parameters) which is currently looped.
# Note that intr, ind1 and ind2 are created in kernL().
#
# Returns: A list of various information which helps manipulates the kernel
# matrices in BlockB_fn().
ind.int1 <- intr[1, ] == k; ind.int2 <- intr[2, ] == k # locating var/kernel matrix
ind.int <- which(ind.int1 | ind.int2) # of interactions (out of 1:no.int)
k.int <- ind.int + p # which kernel matrix has interactions involves k
k.int.lam <- c(intr[1, ][ind.int2], intr[2, ][ind.int1]) # which has interaction with k?
nok <- (1:p)[-k] # all variables excluding k
k.noint <- which(!(ind.int1 | ind.int2)) + p # the opposite of k.int
# P.mat %*% R.mat + R.mat %*% P.mat indices ----------------------------------
grid.PR1 <- expand.grid(k, nok)
za <- apply(grid.PR1, 1, findH2, ind1 = ind1, ind2 = ind2)
grid.PR2 <- expand.grid(k.int, nok)
zb <- apply(grid.PR2, 1, findH2, ind1 = ind1, ind2 = ind2)
grid.PR.lam <- expand.grid(k.int.lam, nok)
# P.mat %*% U.mat + U.mat %*% P.mat indices ----------------------------------
grid.PU1 <- expand.grid(k, k.noint)
zc <- apply(grid.PU1, 1, findH2, ind1 = ind1, ind2 = ind2)
grid.PU2 <- expand.grid(k.int, k.noint)
zd <- apply(grid.PU2, 1, findH2, ind1 = ind1, ind2 = ind2)
grid.PU.lam <- expand.grid(k.int.lam, k.noint)
# P.mat %*% P.mat indices ----------------------------------------------------
grid.Psq <- t(combn(c(k, k.int), 2))
ze <- apply(grid.Psq, 1, findH2, ind1 = ind1, ind2 = ind2)
grid.Psq.lam <- NULL
if (length(k.int.lam) > 0) grid.Psq.lam <- t(combn(c(0, k.int.lam), 2))
list(
k.int = k.int,
k.int.lam = k.int.lam,
PRU = c(za, zc, zb, zd),
PRU.lam1 = c(rep(0, length(nok) + length(k.noint)),
grid.PR.lam[,1],
grid.PU.lam[,1]),
PRU.lam2 = c(nok, k.noint, grid.PR.lam[,2], grid.PU.lam[,2]),
Psq = c(k, k.int),
Psq.lam = k.int.lam,
P2 = ze,
P2.lam1 = grid.Psq.lam[,1],
P2.lam2 = grid.Psq.lam[,2]
)
}
findH2 <- function(z, ind1, ind2){
# This function finds position of H2 (cross-product terms of H). Used in
# index_fn_B().
#
# Args: z is a dataframe created from expand.grid(), while ind1 and ind2 are
# the indices of the cross-product matrices. These are created in BlockB_fn().
#
# Returns: A logical vector for use in index_fn_B().
x <- z[1]; y <- z[2]
which((ind1 == x & ind2 == y) | (ind2 == x & ind1 == y))
}
tab_intr_3plus <- function(x) {
# Helper function which tabulates >2-way interactions.
#
# Args: x is a character vector of the form "1:2:3" etc. which basically says
# that variable 1 interacts with variable 2 and 3.
#
# Returns: A matrix of interactions information.
p <- max(sapply(strsplit(x, ":"), length))
sapply(strsplit(x, ":"), function(x) {
p_ <- length(x)
if (p_ < p) as.numeric(c(x, rep(0, p - p_)))
else as.numeric(x)
})
}
where_int <- function(x) {
# Function to determine where interactions are. Used in formula_to_xy().
#
# Args: x is a table obtained from the terms of a formula fit.
#
# Returns: The same table x but with replaced values.
tmp <- x > 0
x[tmp] <- which(x > 0)
x[!tmp] <- 0
x
}
which_intr_3plus <- function(x) {
# Function to determine the positions of the >2-way interactions.
#
# Args: x is a character vector of the form "1:2:3" etc. which basically says
# that variable 1 interacts with variable 2 and 3.
#
# Returns: A logical vector, with TRUE indicating that that position has a
# >2-way interaction.
sapply(strsplit(x, ""), function(x) length(x) > 3)
}
reorder_x <- function(x, smp) {
# For the Nystrom method, the data (vector or matrix) needs to be reordered
# according to the random sample.
#
# Args: x (vector or matrix), and smp is the sequence of reordering.
#
# Returns: A reordered vector or matrix x.
if (is.null(nrow(x))) res <- x[smp]
else res <- x[smp, ]
mostattributes(res) <- attributes(x)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.