Nothing
## PMA package is needed to install CVR
## PMA depends on impute, which is removed from CRAN to bioconuctor
## Use the following to install impute
## (try http:// if https:// URLs are not supported)
## source("https://bioconductor.org/biocLite.R")
## biocLite("impute")
## useDynLib(CVR, .registration = TRUE)
#' @useDynLib CVR
#' @import Rcpp
#' @import graphics
#' @import stats
#' @title Sparse canonical correlation analysis.
#'
#' @description Get sparse CCA solutions of X1 and X2. Use \code{CCA} and \code{CCA.permute} from PMA package.
#' See PMA package for details.
#'
#' @usage SparseCCA(X1, X2, rank = 2, penaltyx1 = NULL, penaltyx2 = NULL,
#' nperms = 25, ifplot = 0)
#' @param X1,X2 Numeric matrices representing the two sets of covariates. They should have the same number of rows and
#' cannot contain missing values.
#' @param rank The number of canonical variate pairs wanted. The default is 2.
#' @param penaltyx1,penaltyx2 Numeric vectors as the penalties to be applied to X1 and X2. The defaults are seq(0.1, 0.7, len = 20).
#' See PMA package for details.
#' @param nperms Number of times the data should be permuted. The default is 25. Don't use too small value. See PMA package for details.
#' @param ifplot 0 or 1. The default is 0 which means don't plot the result. See PMA package for details.
#' @details This function is generally used for tuning the penalties in sparse CCA if \code{penaltyx1} and \code{penaltyx2} are supplied as vectors.
#' The CCA solution is based on PMD algorithm and the tuning is based on permutation.
#' The fitted W1 and W2 are scaled so that the diagonals of W1'X1'X1W1 and W2'X2'X2W2 are all 1's.
#'
#' Specifically, if a single value of mild penalty is provided for both \code{penaltyx1} and \code{penaltyx2}, the result can be used as
#' initial values of W's in CVR. For instance, with \code{penaltyx1 = 0.7} and \code{penaltyx2 = 0.7},
#' the fitted W1 and W2 are only shrinked, but mostly not zero yet.
#' @return \item{W1}{Loading matrix corresponding to X1. X1*W1 gives the canonical variates from X1.}
#' @return \item{W2}{Loading matrix corresponding to X2. X2*W2 gives the canonical variates from X2.}
#' @author Chongliang Luo, Kun Chen.
#' @references Daniela M. Witten, Robert Tibshirani and Trevor Hastie (2009) A penalized matrix decomposition, with applications to
#' sparse principal components and canonical correlation analysis. Biostatistics 10(3), 515-534.
#'
#' Chongliang Luo, Jin Liu, Dipak D. Dey and Kun Chen (2016) Canonical variate regression.
#' Biostatistics, doi: 10.1093/biostatistics/kxw001.
#' @seealso \code{\link{CVR}}, \code{\link{CCA}}, \code{\link{CCA.permute}}.
#' @import PMA
#' @export
SparseCCA <- function(X1, X2, rank = 2, penaltyx1 = NULL, penaltyx2 = NULL, nperms = 25, ifplot = 0){
if (nrow(X1) != nrow(X2)) stop("X1 and X2 should have the same row number!")
if (ncol(X1) == 1 | ncol(X2) == 1) stop("X1 and X2 should have at least 2 columns!")
if (is.null(penaltyx1) | is.null(penaltyx2)){
penaltyx1 <- seq(0.1, 0.7, len = 20)
penaltyx2 <- seq(0.1, 0.7, len = 20)
}
if (length(penaltyx1) > 1){
perm.out <- CCA.permute(X1, X2, "standard", "standard", penaltyx1, penaltyx2, nperms = nperms)
if (ifplot == 1 ) {
print(c(perm.out$bestpenaltyx, perm.out$bestpenaltyz))
plot(perm.out)
}
SparseCCA_X12 <- CCA(X1, X2, typex = "standard", typez = "standard", K = rank,
penaltyx = perm.out$bestpenaltyx, penaltyz = perm.out$bestpenaltyz,
v = perm.out$v.init, trace = FALSE)
} else {
SparseCCA_X12 <- CCA(X1, X2, typex = "standard", typez = "standard", K = rank,
penaltyx = penaltyx1, penaltyz = penaltyx2, trace = FALSE)
}
D1 <- diag(diag(t(X1 %*% SparseCCA_X12$u) %*% X1 %*% SparseCCA_X12$u)^(-0.5), rank, rank)
D2 <- diag(diag(t(X2 %*% SparseCCA_X12$v) %*% X2 %*% SparseCCA_X12$v)^(-0.5), rank, rank)
W1 <- SparseCCA_X12$u %*% D1
W2 <- SparseCCA_X12$v %*% D2 ## to make X1 %*% W1 orthogonal
return(list(W1 = W1, W2 = W2))
}
#' @title Generate simulation data.
#'
#' @description Generate two sets of covariates and an univariate response driven by several latent factors.
#'
#' @usage SimulateCVR(family = c("gaussian", "binomial", "poisson"), n = 100,
#' rank = 4, p1 = 50, p2 = 70, pnz = 10, sigmax = 0.2,
#' sigmay = 0.5, beta = c(2, 1, 0, 0), standardization = TRUE)
#'
#' @param family Type of response. \code{"gaussian"} for continuous response, \code{"binomial"}
#' for binary response, and \code{"poisson"} for Poisson response. The default is \code{"gaussian"}.
#' @param n Number of rows. The default is 100.
#' @param rank Number of latent factors generating the covariates. The default is 4.
#' @param p1 Number of variables in X1. The default is 50.
#' @param p2 Number of variables in X2. The default is 70.
#' @param pnz Number of variables in X1 and X2 related to the signal. The default is 10.
#' @param sigmax Standard deviation of normal noise in X1 and X2. The default is 0.2.
#' @param sigmay Standard deviation of normal noise in Y. Only used when the response is Gaussian. The default is 0.5.
#' @param beta Numeric vector, the coefficients used to generate respose from the latent factors. The default is c(2, 1, 0, 0).
#' @param standardization Logical. If TRUE, standardize X1 and X2 before output. The default is TRUE.
#'
#' @details The latent factors in U are randomly generated normal vectors,
#'
#' \eqn{X_1 = U*V_1 + \sigma_x*E_1, X_2 = U*V_2 + \sigma_x*E_2, E_1, E_2}{%
#' X1 = U*V1 + sigmax*E1, X2 = U*V2 + sigmax*E2, E1, E2} are N(0,1) noise matrices.
#'
#' The nonzero entries of \eqn{V_1}{%
#' V1} and \eqn{V_2}{%
#' V2} are generated from Uniform([-1,-0.5]U[0.5,1]).
#'
#' For Gaussian response,
#'
#' \eqn{y = U*\beta + \sigma_y*e_y, e_y}{%
#' y = U*\beta + sigmay*ey, ey} is N(0,1) noise vector,
#'
#' for binary response,
#'
#' \eqn{y \sim rbinom(n, 1, 1/(1 + \exp(-U*\beta)))},
#'
#' and for Poisson response,
#'
#' \eqn{y \sim rpois(n, \exp(U*\beta))}.
#'
#' See the reference for more details.
#'
#' @return \item{X1, X2}{The two sets of covariates with dimensions n*p1 and n*p2 respectively.}
#' @return \item{y}{The response vector with length n.}
#' @return \item{U}{The true latent factor matrix with dimension n*rank.}
#' @return \item{beta}{The coefficients used to generate response from \code{U}. The length is rank.}
#' @return \item{V1, V2}{The true loading matrices for X1 and X2 with dimensions p1*rank and p2*rank. The first \code{pnz} rows are nonzero.}
#' @author Chongliang Luo, Kun Chen.
#' @references Chongliang Luo, Jin Liu, Dipak D. Dey and Kun Chen (2016) Canonical variate regression.
#' Biostatistics, doi: 10.1093/biostatistics/kxw001.
#' @seealso \code{\link{CVR}}, \code{\link{cvrsolver}}.
#' @examples
#' set.seed(42)
#' mydata <- SimulateCVR(family = "g", n = 100, rank = 4, p1 = 50, p2 = 70,
#' pnz = 10, beta = c(2, 1, 0, 0))
#' X1 <- mydata$X1
#' X2 <- mydata$X2
#' Xlist <- list(X1 = X1, X2 = X2);
#' Y <- mydata$y
#' opts <- list(standardization = FALSE, maxIters = 300, tol = 0.005)
#' ## use sparse CCA solution as initial values, see SparseCCA()
#' Wini <- SparseCCA(X1, X2, 4, 0.7, 0.7)
#' ## perform CVR with fixed eta and lambda, see cvrsolver()
#' fit <- cvrsolver(Y, Xlist, rank = 4, eta = 0.5, Lam = c(1, 1),
#' family = "gaussian", Wini, penalty = "GL1", opts)
#' ## check sparsity recovery
#' fit$W[[1]];
#' fit$W[[2]];
#' ## check orthogonality
#' X1W1 <- X1 %*% fit$W[[1]];
#' t(X1W1) %*% X1W1
#' @export
SimulateCVR <- function(family = c("gaussian", "binomial", "poisson"), n = 100, rank = 4,
p1 = 50, p2 = 70, pnz = 10, sigmax = 0.2, sigmay = 0.5,
beta = c(2, 1, 0, 0), standardization = TRUE){
family <- match.arg(family)
U <- matrix(rnorm(2 * n * rank), 2 * n, rank);
## U = svd(U)$u*sqrt(n); # n by r
V1 <- matrix(0, p1, rank);
V11 <- matrix(runif(rank * pnz) * (rbinom(rank * pnz, 1, 0.5) * 2-1), ncol = rank);
V2 <- matrix(0, p2, rank);
V21 <- matrix(runif(rank * pnz) * (rbinom(rank * pnz, 1, 0.5) * 2 - 1), ncol = rank);
V1[1:pnz, ] <- svd(V11)$u;
V2[1:pnz, ] <- svd(V21)$u;
x1 <- U %*% t(V1);
x2 <- U %*% t(V2);
lp <- U %*% beta;
e1 <- matrix(rnorm(2 * n * p1), 2 * n, p1) * sigmax;
e2 <- matrix(rnorm(2 * n * p2), 2 * n, p2) * sigmax;
X1 <- x1[1:n, ] + e1[1:n, ];
X2 <- x2[1:n, ] + e2[1:n, ];
X1test <- x1[(n + 1):(2 * n), ] + e1[(n + 1):(2 * n), ];
X2test <- x2[(n + 1):(2 * n), ] + e2[(n + 1):(2 * n), ];
U <- U[1:n, ]
if (family == "gaussian") {
e <- rnorm(2 * n) * sigmay;
y <- lp[1:n] + e[1:n];
ytest <- lp[(n + 1):(2 * n)] + e[(n + 1):(2 * n)];
if (standardization) {
V1 <- diag(apply(X1, 2, sd)) %*% V1 / sqrt(n - 1)
V2 <- diag(apply(X2, 2, sd)) %*% V2 / sqrt(n - 1);
y <- scale(y);
X1 <- scale(X1);
X2 <- scale(X2);#/sqrt(n-1)
ytest <- scale(ytest);
X1test <- scale(X1test);
X2test <- scale(X2test);#/sqrt(n-1)
}
snr.x1 <- sd(x1) / sd(e1)
snr.x2 <- sd(x2) / sd(e2)
snr.y <- sd(lp) / sd(e);
snr <- c(snr.x1, snr.x2, snr.y)
} else if (family == "binomial") {
px <- 1 / (1 + exp(-lp));
y <- rbinom(n, 1, px[1:n])
ytest <- rbinom(n, 1, px[(n + 1):(2 * n)])
if (standardization) {
V1 <- diag(apply(X1, 2, sd)) %*% V1 / sqrt(n - 1);
V2 <- diag(apply(X2, 2, sd)) %*% V2 / sqrt(n - 1);
X1 <- scale(X1);
X2 <- scale(X2);
X1test <- scale(X1test);
X2test <- scale(X2test);
}
snr.x1 <- sd(x1) / sd(e1)
snr.x2 <- sd(x2) / sd(e2)
snr <- c(snr.x1, snr.x2, 0)
} else if (family == "poisson") {
px <- exp(lp);
y <- rpois(n, px[1:n])
ytest <- rpois(n, px[(n + 1):(2 * n)])
if (standardization) {
V1 <- diag(apply(X1, 2, sd)) %*% V1 / sqrt(n - 1);
V2 <- diag(apply(X2, 2, sd)) %*% V2 / sqrt(n - 1);
X1 <- scale(X1);
X2 <- scale(X2);
X1test <- scale(X1test);
X2test <- scale(X2test);
}
snr.x1 <- sd(x1) / sd(e1)
snr.x2 <- sd(x2) / sd(e2)
snr <- c(snr.x1, snr.x2, 0)
}
## X1test = X1test, X2test = X2test, ytest = as.matrix(ytest), x1 = x1, x2 = x2, lp = lp, snr = snr
return(list(X1 = X1, X2 = X2, y = as.matrix(y), U = U, beta = beta, V1 = V1, V2 = V2))
}
## calculate area under ROC curve (AUC) and deviance in binomial case
CalcROC <- function(y, px, ifplot = 0){
## predict binomial response with given fitted prob px,
## compute AUC of ROC / (neg) deviance (use it instead of AUC, for small sample size)
## input:
## y: real response
## px: fitted prob seq
## output:
## list(spec = spec, sens = sens, auc = auc, dev = dev)
n <- length(px);
np <- sum(y == 1);
nn <- n - np;
sens <- c();
spec <- c();
if (np == 0) {
auc <- sum(px < 0.5) / nn;
} else if (nn == 0) {
auc <- sum(px > 0.5) / np;
} else {
for (i in 1:n) {
sens[i] <- (sum(px > px[i] & y == 1) + 0.5 * sum(px == px[i] & y == 1)) / np;
spec[i] <- (sum(px < px[i] & y == 0) + 0.5 * sum(px == px[i] & y == 0)) / nn;
}
}
if (ifplot == 1) plot(1 - spec, sens);
##auc = trapz(sort(1-spec),sort(sens));
xx <- sort(1 - spec, index.return = TRUE)
yy <- c(0, sens[xx$ix], 1)
xx <- c(0, xx$x, 1)
auc <- sum((xx[-1] - xx[-(n + 2)]) * (yy[-1] + yy[-(n + 2)])) / 2
dev <- -2 * sum(y * log(px + 0.00001) + (1 - y) * log(1 - px + 0.0001))
return(list(spec = spec, sens = sens, auc = auc, dev = dev))
}
## calculate the error of testing data (mse, auc or deviance) using fitted W1, W2, alpha, beta
PredCVR <- function(Ytest, X1test, X2test, W1, W2, alpha = 0, beta = 0, Y, X1, X2, family = "gaussian",
refit = TRUE, combine = TRUE, type.measure = NULL){
## compute the pred mse/AUC(deviance) of testing data
X1testW1 <- X1test %*% W1;
X2testW2 <- X2test %*% W2;
if (family == "gaussian") {
type.measure <- "mse";
} else if (family == "binomial") {
if(length(Ytest) > 10 & is.null(type.measure)) type.measure <- "auc";
if(length(Ytest) < 11 & is.null(type.measure)) type.measure <- "deviance";
} else if (family == "poisson") {
type.measure <- "deviance";
}
if (refit) {
X1W1 <- X1 %*% W1;
X2W2 <- X2 %*% W2;
XW <- cbind(X1W1, X2W2)
XtestW <- cbind(X1testW1, X2testW2)
if (family == "gaussian") {
## mse is the average of two predictions from X1testW1 and X2testW2
if (combine) {
ab <- coef(lm(Y ~ XW));
ab[is.na(ab)] <- 0
mse <- sum((Ytest - ab[1] - XtestW %*% ab[-1])^2) / length(Ytest)
} else {
ab1 <- coef(lm(Y ~ X1W1))
ab2 <- coef(lm(Y ~ X2W2))
ab1[is.na(ab1)] <- 0;
ab2[is.na(ab2)] <- 0;
mse <- (sum((Ytest - ab1[1] - X1testW1 %*% ab1[-1])^2)
+ sum((Ytest - ab2[1] - X2testW2 %*% ab2[-1])^2)) / length(Ytest) / 2
}
if (is.na(mse)) mse <- sum(Ytest^2) / length(Ytest) # for empty W1 and W2
return(mse)
} else if (family == "binomial") {
if (sum(abs(W1)) == 0 | sum(abs(W2)) == 0) { ## avoid fitting glm with zero predictors
if (type.measure == "auc") {
return(0)
} else if (type.measure == "deviance") {
dev <- -2 * length(Ytest) * log(0.00001)
return(dev)
}
} else {
if (combine) {
refit12 <- glm(Y ~ XW, family = "binomial")
ab <- coef(refit12)
ab[is.na(ab)] <- 0
px <- 1 / (1 + exp(-XtestW %*% ab[-1] - ab[1]))
if (type.measure == "auc") {
auc <- CalcROC(Ytest, px)$auc;
return(auc)
} else if (type.measure == "deviance") {
dev <- CalcROC(Ytest, px)$dev;
return(dev)
}
} else {
refit1 <- glm(Y ~ X1W1, family = "binomial")
refit2 <- glm(Y ~ X2W2, family = "binomial")
ab1 <- coef(refit1)
ab2 <- coef(refit2)
ab1[is.na(ab1)] <- 0
ab2[is.na(ab2)] <- 0
px1 <- 1 / (1 + exp(-X1testW1 %*% ab1[-1] - ab1[1]))
px2 <- 1 / (1 + exp(-X2testW2 %*% ab2[-1] - ab2[1]))
if (type.measure == "auc") {
auc <- (CalcROC(Ytest, px1)$auc + CalcROC(Ytest, px2)$auc) / 2
return(auc)
} else if (type.measure == "deviance") {
dev <- (CalcROC(Ytest, px1)$dev + CalcROC(Ytest, px2)$dev) / 2
return(dev)
}
}
}
} else if (family == "poisson") {
if (sum(abs(W1)) == 0 | sum(abs(W2)) == 0) {
dev <- -2 * length(Ytest) * log(0.000001)
return(dev)
} else {
if (combine) {
refit12 <- glm(Y ~ XW, family = "poisson")
ab <- coef(refit12)
ab[is.na(ab)] <- 0
px <- exp(XtestW %*% ab[-1] + ab[1])
dev <- -2 * sum(Ytest * log(px) - px)
return(dev)
} else {
refit1 <- glm(Y ~ X1W1, family = "poisson")
refit2 <- glm(Y ~ X2W2, family = "poisson")
ab1 <- coef(refit1);
ab2 <- coef(refit2)
ab1[is.na(ab1)] <- 0;
ab2[is.na(ab2)] <- 0
px1 <- exp(X1testW1 %*% ab1[-1] + ab1[1])
px2 <- exp(X2testW2 %*% ab2[-1] + ab2[1])
dev <- -sum(Ytest * log(px1) - px1) - sum(Ytest * log(px2) - px2)
return(dev)
}
}
}
} else {
if (family == "gaussian") {
lp1 <- alpha + X1testW1 %*% beta
lp2 <- alpha + X2testW2 %*% beta
mse <- sum((Ytest - lp1)^2 + (Ytest - lp2)^2) / length(Ytest) / 2
return(mse)
} else if (family == "binomial") {
if (sum(abs(W1)) == 0 | sum(abs(W2)) == 0) {
if (type.measure == "auc") {
return(0)
} else if (type.measure == "deviance") {
dev <- -2 * length(Ytest) * log(0.00001)
return(dev)
}
} else {
lp1 <- alpha + X1testW1 %*% beta
lp2 <- alpha + X2testW2 %*% beta
px1 <- 1 / (1 + exp(-lp1))
px2 <- 1 / (1 + exp(-lp2))
if (type.measure == "auc") {
auc <- (CalcROC(Ytest, px1)$auc + CalcROC(Ytest, px2)$auc) / 2
return(auc)
} else if (type.measure == "deviance") {
dev <- (CalcROC(Ytest, px1)$dev + CalcROC(Ytest, px2)$dev) / 2
return(dev)
}
}
} else if (family == "poisson") {
if (sum(abs(W1)) == 0 | sum(abs(W2)) == 0) {
dev <- -2 * length(Ytest) * log(0.000001)
return(dev)
} else {
lp1 <- alpha + X1testW1 %*% beta
lp2 <- alpha + X2testW2 %*% beta
px1 <- exp(lp1)
px2 <- exp(lp2)
dev <- -sum(Ytest * log(px1) - px1) - sum(Ytest * log(px2) - px2)
return(dev)
}
}
}
}
## calculate solution path of CVR (along the sequence of lambda)
CVRPath <- function(Y, Xlist, rank = 2, eta = 0.5, Lamseq,
family = c("gaussian", "binomial", "poisson"),
Wini = NULL, penalty = c("GL1", "L1"), opts){
family <- match.arg(family)
penalty <- match.arg(penalty)
Y <- as.matrix(Y)
K <- length(Xlist)
if (dim(Lamseq)[2] != K)
stop("The length of Lambda is not the same to K!");
nlam <- dim(Lamseq)[1]
family <- match.arg(family)
penalty <- match.arg(penalty)
if (is.null(Wini))
Wini <- SparseCCA(Xlist[[1]], Xlist[[2]], rank, 0.7, 0.7)
p <- c()
WPath <- list()
for (k in 1:K) {
p[k] <- dim(Xlist[[k]])[2]
WPath[[k]] <- array(0, c(nlam, p[k], rank));
}
betaPath <- matrix(0, nlam, rank)
alphaPath <- rep(0, nlam)
iterPath <- rep(0, nlam)
fitwork <- list()
warm <- TRUE
Wwork <- Wini
for (i in 1:nlam){
if (i == 1) {
optswork <- opts;
optswork$maxIters <- 2 * opts$maxIters;
} else {
optswork <- opts;
if (warm)
Wwork <- fitwork$W; ## warm start
}
fitwork <- cvrsolver(Y, Xlist, rank, eta, as.vector(Lamseq[i, ]), family, Wwork, penalty, optswork);
Wzero <- 0
for (k in 1:K)
Wzero <- Wzero + sum(fitwork$W[[k]]!=0);
if (Wzero == 0)
break;
for (k in 1:K)
WPath[[k]][i, , ] <- fitwork$W[[k]];
betaPath[i, ] <- fitwork$beta;
alphaPath[i] <- fitwork$alpha;
iterPath[i] <- fitwork$iter
}
return(list(WPath = WPath, betaPath = betaPath, alphaPath = alphaPath, iterPath = iterPath))
}
## cross validate to select lambda, with fixed rank and eta
TuneCVR <- function(Y, Xlist, rank, eta, Lamseq = NULL,
family = c("gaussian", "binomial", "poisson"),
Wini = NULL, penalty = c("GL1", "L1"),
nfold = 10, foldid = NULL, type.measure = NULL, opts){
## cross-validate to select lam's
## input: Lamseq: nlam by K, K=2
## output: list(Lamseq = Lamseq, cverror = pred, cvm = mpred, msparse = msparse,
## Lamhat = Lamhat, W1trace = W1trace, W2trace = W2trace, cvr.fit = fit,
## alpha = alpha.refit, beta = beta.refit, type.measure = type.measure)
## cvr.fit: Refit all the data using the selected parameters,
## see the output of cvrsolver(). Also output the coefficients of regressing
## the response to the combined fitted canonical variates (Y ~ cbind(X1W1, X2W2)).
##
family <- match.arg(family)
penalty <- match.arg(penalty)
Y <- as.matrix(Y)
n <- dim(Y)[1]
K <- length(Xlist)
X1 <- as.matrix(Xlist[[1]])
X2 <- as.matrix(Xlist[[2]])
p1 <- ncol(X1)
p2 <- ncol(X2)
if (is.null(Wini))
Wini <- SparseCCA(X1, X2, rank, 0.7, 0.7)
if (family == "gaussian") {
type.measure <- "mse";
} else if (family == "binomial") {
if (n / nfold > 10 & is.null(type.measure)) type.measure <- "auc";
if (n / nfold < 11 & is.null(type.measure)) type.measure <- "deviance";
} else if (family == "poisson") {
type.measure <- "deviance";
}
warm <- TRUE;
opts$nrank <- rank
opts$family <- family
opts$penalty <- penalty
Lamseq <- as.matrix(Lamseq)
if (dim(Lamseq)[2] != K) {
lamseq <- 10^(seq(-2, 1.3, len = 50))
Lamseq <- cbind(lamseq, lamseq * p2 / p1)
}
nlam <- dim(Lamseq)[1];
pred <- matrix(0, nlam, nfold);
itertrace <- matrix(0, nlam, nfold)
W1trace <- array(0, c(nlam, nfold, p1, rank))
W2trace <- array(0, c(nlam, nfold, p2, rank))
sparse <- matrix(0, nlam, nfold) # monitor sparsity
if (is.null(foldid)) {
tmp <- ceiling(c(1:n) / (n / nfold));
foldid <- sample(tmp, n)
}
for (i in 1:nfold) {
X1train <- Xlist[[1]][foldid != i, ];
X2train <- Xlist[[2]][foldid != i, ];
Xtrain <- list(X1 = X1train, X2 = X2train);
Ytrain <- as.matrix(Y[foldid != i, ]);
X1test <- Xlist[[1]][foldid == i, ];
X2test <- Xlist[[2]][foldid == i, ];
Ytest <- as.matrix(Y[foldid == i, ]);
if (is.null(opts$W))
opts$W = SparseCCA(X1train, X2train, rank, 0.7, 0.7)
obj <- CVRPath(Ytrain, Xtrain, rank, eta, Lamseq, family, opts$W, penalty, opts);
W1trace[, i, , ] <- obj$WPath[[1]]
W2trace[, i, , ] <- obj$WPath[[2]]
itertrace[, i] <- obj$iterPath
for (j in 1:nlam){
W1 <- obj$WPath[[1]][j, , ]
W2 <- obj$WPath[[2]][j, , ]
## sparse = proportion of nonzero elements among W1 and W2
sparse[j, i] <- (sum(W1 != 0) + sum(W2 != 0)) / (length(W1) + length(W2))
alpha <- as.numeric(obj$alphaPath[j])
beta <- obj$betaPath[j, ]
pred[j, i] <- PredCVR(Ytest, X1test, X2test, W1, W2, alpha, beta,
Ytrain, X1train, X2train, family = family,
refit = TRUE, type.measure = type.measure)
}
}
mpred <- apply(pred, 1, mean)
msparse <- apply(sparse, 1, mean)
spthresh <- opts$spthresh
if (type.measure == "mse" | type.measure == "deviance") {
## search the best lam only among sparse models
ilam <- which.min(mpred[msparse <= spthresh]) + min(which(msparse <= spthresh)) - 1
ifold <- which.min(pred[ilam, ])
mmpred <- min(mpred[msparse <= spthresh])
} else if (type.measure == "auc") {
ilam <- which.max(mpred[msparse <= spthresh]) + min(which(msparse <= spthresh)) - 1
ifold <- which.max(pred[ilam, ])
mmpred <- max(mpred[msparse <= spthresh])
}
Lamhat <- Lamseq[ilam, ]
## refit all the data with Lamhat
if (warm) {
Wini <- list(W1 = as.matrix(W1trace[max(ilam - 1, 1), ifold, , ]),
W2 = as.matrix(W2trace[max(ilam - 1, 1), ifold, , ]))
}
fit <- cvrsolver(Y, Xlist, rank, eta, Lamhat, family, Wini, penalty, opts)
XW <- cbind(X1 %*% fit$W[[1]], X2 %*% fit$W[[2]])
if (family == "gaussian") {
ab <- coef(lm(Y ~ XW));
ab[is.na(ab)] <- 0
alpha.refit <- ab[1]
beta.refit <- ab[-1]
} else {
ab <- coef(glm(Y ~ XW, family = family))
ab[is.na(ab)] <- 0
alpha.refit <- ab[1]
beta.refit <- ab[-1]
}
refit <- list(alpha = alpha.refit, beta = beta.refit, W = fit$W)
cv.out <- list(Lamseq = Lamseq, eta = eta, rank = rank,
cverror = pred, cvm = mpred, mmpred = mmpred, msparse = msparse,
Lamhat = Lamhat, W1trace = W1trace, W2trace = W2trace, cvr.fit = fit,
refit = refit, type.measure = type.measure, spthresh = spthresh)
class(cv.out) <- "TuneCVR"
return(cv.out)
}
## plot mean cv error and sparsity of the solution path from TuneCVR objects
plot.TuneCVR <- function(x) {
par(mfrow = c(2, 1))
plot(log(x$Lamseq[, 1]), x$cvm, xlab="",
ylab = x$type.measure, type = "l", col = "red",
main = paste("CVR(eta = ", round(x$eta, 5), ", rank = ", x$rank, ")", sep=""))
abline(v = log(x$Lamhat[1]))
plot(log(x$Lamseq[, 1]), x$msparse,
xlab = expression(log ~ lambda),
ylab = "sparsity", type = "l")
abline(h = x$spthresh)
}
#' @title Fit canonical variate regression with tuning parameters selected by cross validation.
#'
#' @description This function fits the solution path of canonical variate regression,
#' with tuning parameters selected by cross validation. The tuning parameters
#' include the rank, the \eqn{\eta} and the \eqn{\lambda}.
#' @usage
#' CVR(Y, Xlist, rankseq = 2, neta = 10, etaseq = NULL, nlam = 50,
#' Lamseq = NULL, family = c("gaussian", "binomial", "poisson"),
#' Wini = NULL, penalty = c("GL1", "L1"), nfold = 10, foldid = NULL,
#' opts = list(), type.measure = NULL)
#'
#' @param Y A univariate response variable.
#' @param Xlist A list of two covariate matrices as in \code{cvrsolver}.
#' @param rankseq A sequence of candidate ranks. The default is a single value 2.
#' @param neta Number of \eqn{\eta} values. The default is 10.
#' @param etaseq A sequence of length \code{neta} containing candidate \eqn{\eta} values between 0 and 1.
#' The default is 10^seq(-2, log10(0.9), length = neta).
#' @param nlam Number of \eqn{\lambda} values. The default is 50.
#' @param Lamseq A matrix of \eqn{\lambda} values. The column number is the number of sets in \code{Xlist},
#' and the row number is \code{nlam}. The default is 10^(seq(-2, 2, length = nlam)) for each column.
#' @param family Type of response as in \code{cvrsolver}. The default is \code{"gaussian"}.
#' @param Wini A list of initial loading W's. The default is from the SparseCCA solution. See \code{SparseCCA}.
#' @param penalty Type of penalty on loading matrices W's as in \code{cvrsolver}. The default is \code{"GL1"}.
#' @param nfold Number of folds in cross validation. The default is 10.
#' @param foldid Specifying training and testing sets in cross validation; random generated if not supplied.
#' It remains the same across different rank and \eqn{\eta}.
#' @param opts A list of options for controlling the algorithm. The default of \code{opts$spthresh} is 0.4, which means
#' we only search sparse models with at most 40\% nonzero entries in W1 and W2. See the other options
#' (\code{standardization}, \code{maxIters} and \code{tol}) in \code{cvrsolver}.
#' @param type.measure Type of measurement used in cross validation. \code{"mse"} for Gaussian, \code{"auc"} for binomial,
#' and \code{"deviance"} for binomial and Poisson.
#'
#' @details In this function, the rank, \eqn{\eta} and \eqn{\lambda} are tuned by cross validation. CVR then is refitted with
#' all data using the selected tuning parameters. The \code{plot} function shows the tuning of \eqn{\lambda},
#' with selected rank and \eqn{\eta}.
#'
#' @return An object with S3 class "CVR" containing the following components
#' @return \item{cverror}{A matrix containing the CV errors. The number of rows is the length
#' of \code{etaseq} and the number of columns is the length of \code{rankseq}.}
#' @return \item{etahat}{Selected \eqn{\eta}.}
#' @return \item{rankhat}{Selected rank.}
#' @return \item{Lamhat}{Selected \eqn{\lambda}'s.}
#' @return \item{Alphapath}{An array containing the fitted paths of the intercept term \eqn{\alpha}.}
#' @return \item{Betapath}{An array containing the fitted paths of the regression coefficient \eqn{\beta}.}
#' @return \item{W1path, W2path}{Arrays containing the fitted paths of W1 and W2.}
#' @return \item{foldid}{\code{foldid} used in cross validation.}
#' @return \item{cvout}{Cross validation results using selected \eqn{\eta} and rank.}
#' @return \item{solution}{A list including the solutions of \eqn{\alpha}, \eqn{\beta}, W1 and W2, by refitting all the data
#' using selected tuning parameters.}
#'
#' @author Chongliang Luo, Kun Chen.
#'
#' @references Chongliang Luo, Jin Liu, Dipak D. Dey and Kun Chen (2016) Canonical variate regression.
#' Biostatistics, doi: 10.1093/biostatistics/kxw001.
#' @seealso \code{\link{cvrsolver}}, \code{\link{SparseCCA}}, \code{\link{SimulateCVR}}.
#' @examples
#' ############## Gaussian response ######################
#' set.seed(42)
#' mydata <- SimulateCVR(family = "g", n = 100, rank = 4, p1 = 50, p2 = 70,
#' pnz = 10, beta = c(2, 1, 0, 0))
#' X1 <- mydata$X1;
#' X2 <- mydata$X2
#' Xlist <- list(X1 = X1, X2 = X2);
#' Y <- mydata$y
#' ## fix rank = 4, tune eta and lambda
#' ##out_cvr <- CVR(Y, Xlist, rankseq = 4, neta = 5, nlam = 25,
#' ## family = "g", nfold = 5)
#' ## out_cvr$solution$W[[1]];
#' ## out_cvr$solution$W[[2]];
#' ### uncomment to see plots
#' ## plot.CVR(out_cvr)
#' ##
#' ## Distance of subspaces
#' ##U <- mydata$U
#' ##Pj <- function(U) U %*% solve(t(U) %*% U, t(U))
#' ##sum((Pj(U) - (Pj(X1 %*% out_cvr$sol$W[[1]]) + Pj(X2 %*% out_cvr$sol$W[[2]]))/2)^2)
#' ## Precision/Recall rate
#' ## the first 10 rows of the true W1 and W2 are set to be nonzero
#' ##W12 <- rbind(out_cvr$sol$W[[1]], out_cvr$sol$W[[1]])
#' ##W12norm <- apply(W12, 1, function(a)sqrt(sum(a^2)))
#' ##prec <- sum(W12norm[c(1:10, 51:60)] != 0)/sum(W12norm != 0); prec
#' ##rec <- sum(W12norm[c(1:10, 51:60)] != 0)/20; rec
#' ## sequential SparseCCA, compare the Distance of subspaces and Prec/Rec
#' ##W12s <- SparseCCA(X1, X2, 4)
#' ## Distance larger than CVR's
#' ##sum((Pj(U) - (Pj(X1 %*% W12s$W1) + Pj(X2 %*% W12s$W2))/2)^2)
#' ##W12snorm <- apply(rbind(W12s$W1, W12s$W2), 1, function(a)sqrt(sum(a^2)))
#' ## compare Prec/Rec
#' ##sum(W12snorm[c(1:10, 51:60)] != 0)/sum(W12snorm != 0);
#' ##sum(W12snorm[c(1:10, 51:60)] != 0)/20;
#'
#' ############## binary response ########################
#' set.seed(12)
#' mydata <- SimulateCVR(family = "binomial", n = 300, rank = 4, p1 = 50,
#' p2 = 70, pnz = 10, beta = c(2, 1, 0, 0))
#' X1 <- mydata$X1; X2 <- mydata$X2
#' Xlist <- list(X1 = X1, X2 = X2);
#' Y <- mydata$y
#' ## out_cvr <- CVR(Y, Xlist, 4, neta = 5, nlam=25, family = "b", nfold = 5)
#' ## out_cvr$sol$W[[1]];
#' ## out_cvr$sol$W[[2]];
#' ## plot.CVR(out_cvr)
#'
#' ############## Poisson response ######################
#' set.seed(34)
#' mydata <- SimulateCVR(family = "p", n = 100, rank = 4, p1 = 50,
#' p2 = 70, pnz = 10, beta = c(0.2, 0.1, 0, 0))
#' X1 <- mydata$X1; X2 <- mydata$X2
#' Xlist <- list(X1 = X1, X2 = X2);
#' Y <- mydata$y
#' ## etaseq <- 10^seq(-3, log10(0.95), len = 10)
#' ## out_cvr <- CVR(Y, Xlist, 4, neta = 5, nlam = 25, family = "p", nfold = 5)
#' ## out_cvr$sol$W[[1]];
#' ## out_cvr$sol$W[[2]];
#' ## plot.CVR(out_cvr)
#' @export
CVR <- function(Y, Xlist, rankseq = 2, neta = 10, etaseq = NULL, nlam = 50,
Lamseq = NULL, family = c("gaussian", "binomial", "poisson"),
Wini = NULL, penalty = c("GL1", "L1"), nfold = 10, foldid = NULL,
opts = list(), type.measure = NULL){
family <- match.arg(family)
penalty <- match.arg(penalty)
if(is.null(family))
stop('Must specify family of the response!')
if(is.null(penalty)) {
penalty = "GL1";
cat('Use default: penalty = "GL1".\n')
}
Y <- as.matrix(Y)
X1 <- as.matrix(Xlist[[1]])
X2 <- as.matrix(Xlist[[2]])
Xlist <- list(X1 = X1, X2 = X2)
n <- dim(Y)[1]
p1 <- ncol(X1)
p2 <- ncol(X2)
if(p1 == 1 | p2 == 1)
stop("X1 and X2 should have at least 2 columns!")
if(nrow(X1) != nrow(X2))
stop("X1 and X2 should have the same row number!")
if(n != nrow(X1))
stop("Y and X1, X2 should have the same row number!")
opts$n <- n
opts$p1 <- p1
opts$p2 <- p2
mrank <- max(rankseq)
if (is.null(Wini)) {
Wini <- SparseCCA(X1, X2, mrank, 0.7, 0.7);
cat('Use default: initial W = SparseCCA(X1, X2, rank, 0.7, 0.7).\n')
}
if (is.null(opts$standardization)) {
opts$standardization <- TRUE;
#cat('Use default: standardization = TRUE.\n')
}
if (is.null(opts$maxIters)) {
opts$maxIters <- 300;
#cat('Use default: maxIters = 300.\n')
}
if (is.null(opts$tol)) {
opts$tol <- 0.01;
#cat('Use default: tol = 0.01.\n')
}
if (is.null(opts$spthresh)) {
spthresh <- 0.4
opts$spthresh = spthresh;
#cat('Use default: spthresh = 0.4.\n')
}
if (is.null(etaseq)) {
if(is.null(neta))
neta <- 10
etaseq <- 10^seq(-2, log10(0.9), len = neta)
}
if (is.null(Lamseq)) {
if (is.null(nlam))
nlam <- 50
lamseq <- 10^(seq(-2, 2, len = nlam))
Lamseq <- cbind(lamseq, lamseq)
}
if (is.null(foldid)) {
if(is.null(nfold))
nfold <- 10
tmp <- ceiling(c(1:n)/(n/nfold));
foldid <- sample(tmp, n)
} else {
if(nfold != length(unique(foldid))){
#warning("nfold not equal to no. unique values in foldid")
nfold <- length(unique(foldid))
}
}
if (family == "gaussian") {
type.measure <- "mse";
} else if (family == "binomial") {
if (n / nfold > 10 & is.null(type.measure))
type.measure <- "auc";
if (n / nfold < 11 & is.null(type.measure))
type.measure <- "deviance";
} else if (family == "poisson") {
type.measure = "deviance";
}
Lr <- length(rankseq)
Leta <- length(etaseq)
pred <- matrix(0, Leta, Lr)
Lamhat <- matrix(0, Leta, Lr)
Alphapath <- matrix(0, Leta, Lr)
Betapath <- array(0, c(Leta, Lr, mrank))
W1path <- array(0, c(Leta, Lr, p1, mrank))
W2path <- array(0, c(Leta, Lr, p2, mrank))
for (ir in 1:Lr) {
for (ieta in 1:Leta) {
obj_cv <- TuneCVR(Y, Xlist, rankseq[ir], etaseq[ieta], Lamseq, family, Wini, penalty,
nfold = nfold, foldid = foldid, type.measure = type.measure, opts)
pred[ieta, ir] <- obj_cv$mmpred
Lamhat[ieta, ir] <- obj_cv$Lamhat[1]
Alphapath[ieta, ir] <- obj_cv$cvr.fit$alpha
Betapath[ieta, ir, 1:rankseq[ir]] <- obj_cv$cvr.fit$beta
W1path[ieta, ir, , 1:rankseq[ir]] <- obj_cv$cvr.fit$W[[1]]
W2path[ieta, ir, , 1:rankseq[ir]] <- obj_cv$cvr.fit$W[[2]]
}
}
if (type.measure == "mse" | type.measure == "deviance") {
ind <- apply(pred, 2, which.min)
ind2 <- which.min(apply(pred, 2, min))
mmpred <- min(pred)
} else if (type.measure == "auc") {
ind <- apply(pred, 2, which.max)
ind2 <- which.max(apply(pred, 2, max))
mmpred <- max(pred)
}
etahat <- etaseq[ind[ind2]]
rankhat <- rankseq[ind2]
cvout <- TuneCVR(Y, Xlist, rankhat, etahat, Lamseq, family, Wini,
penalty, nfold, foldid = foldid, type.measure, opts)
cvrout <- list(cverror = pred, etahat = etahat, rankhat = rankhat, Lamhat = Lamhat,
Alphapath = Alphapath, Betapath = Betapath, W1path = W1path,
W2path = W2path, foldid = foldid, cvout = cvout,
solution = append(cvout$refit, list(rankseq = rankseq, etaseq = etaseq, Lamseq = Lamseq)))
class(cvrout) <- "CVR"
return(cvrout)
}
#' @title Plot a CVR object.
#'
#' @description Plot the tuning of CVR
#'
#' @usage
#' \method{plot}{CVR}(x, ...)
#'
#' @param x A CVR object.
#' @param ... Other graphical parameters used in plot.
#'
#' @details The first plot is mean cv error vs log(\eqn{\lambda}). The type of mean cv error is
#' decided by \code{type.measure} (see parameters of \code{CVR}). The selected \eqn{\lambda} is marked
#' by a vertical line in the plot. The second plot is sparsity vs log(\eqn{\lambda}).
#' Sparsity is the proportion of non-zero elements in fitted W1 and W2.
#' The threshold is marked by a horizontal line.
#' Press ENTER to see the second plot, which shows the tuning of \eqn{\eta}.
#' @export
plot.CVR <- function(x, ...) {
plot.TuneCVR(x$cvout)
cat ("Press [enter] to continue")
line <- readline()
par(mfrow = c(1, 1))
plot(x$cverror[, which(x$solution$rankseq == x$rankhat)] ~ log10(x$solution$etaseq),
xlab = "log10(etaseq)", ylab = x$cvout$type.measure,
main = "CVR(tune eta)")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.