#' Nonparametric Canonical Correlation Analysis
#'
#' @description Compute the Nonparametric canonical correlations between two data matrices
#'
#' @importFrom FNN get.knnx()
#' @importFrom Matrix spMatrix(), Diagonal()
#' @importFrom irlba irlba()
#'
#' @param X The training data set containing n samples and p features
#' @param Y The paired training data set containing n samples and q features
#' @param XV The validation data set containing nv samples and p features
#' @param YV The paired validation data set containing nv samples and q features
#' @param XT The testing data set containing nt samples and p features
#' @param YT The paired testing data set containing nt samples and q features
#' @param d The dimension of the ouput transformed features
#' @param hx The bandwidth parameters for the KDEs of X
#' @param hy The bandwidth parameters for the KDEs of Y
#' @param nx The number of nearest neighbors for the KDEs of X
#' @param ny The number of nearest neighbors for the KDEs of Y
#' @param PreComputedNNs The name of the .Rda file containing two n*nx matrices for the nearest neighbor indice and the nearest neighbor Euclidean distances for X, and two n*ny matrices for the nearest neighbor indice and the nearest neighbor Euclidean distances for Y. It is optional, only for debugging purposes.
#' @param ...
#'
#' @return X_new The d-dimensional projections of the training data X
#' @return Y_new The d-dimensional projections of the training data Y
#' @return XV_new The d-dimensional projections of the validation data X
#' @return YV_new The d-dimensional projections of the validation data Y
#' @return XT_new The d-dimensional projections of the testing data X
#' @return YT_new The d-dimensional projections of the testing data Y
#' @return The canonical correlation between X_new and Y_new
#' @return The canonical correlation between XV_new and YV_new
#' @return The canonical correlation between XT_new and YT_new
#' @return p value for X_new and V_new
#' @return p value XV_new and YV_new
#' @return p value XT_new and YT_new
#'
#' @export
#' @author Meiwen Jia, \email{meiwen_jia@psych.mpg.de} Ilaria Bonavita, \email{ilaria_bonavita@psych.mpg.de}
#' @references Michaeli, T., Wang, W., & Livescu, K. (2015). Nonparametric canonical correlation analysis. arXiv preprint arXiv:1511.04839.
#' @examples
#'
#' N <- 10000 # Overal number of examples (train+test)
#' N_paired <- 5000 # Number of training examples
#' MaxAngle <- 4*pi
#' MinRadius <-0.3
#' MaxRadius <- 8
#' NumNNs_X <- 20
#' NumNNs_Y <- 20
#' sx <- 0.5
#' sy <- 0.5
#' set.seed(8409)
#' ## Generate data for views 1,2
#' t <- seq(0, MaxAngle, length.out = N)
#' r <- seq(MinRadius, MaxRadius, length.out = N) + 2*runif(N)
#' #### generate X, the noise can be added!
#' X <- cbind(r*cos(t+0*rnorm(N)*0.05), r*sin(t+0*rnorm(N)*0.05))
#' X <- X + 0*matrix(rnorm(N*2), ncol = 2)
#' #### generate Y, the noise can be added!
#' Y <- cbind(t+0*rnorm(N)*1, 2*rnorm(N))
#' Y <- Y + 0*cbind(rep(0, N), rnorm(N))
#' ## Training data
#' PairedIndices <- sample(1:N, N_paired)
#' ## Test (or validation) data
#' UnpairedIndices <- setdiff(1:N,PairedIndices)
#' ncca_res <- ncca.rsvd(X[PairedIndices,],Y[PairedIndices,], X[UnpairedIndices,],Y[UnpairedIndices,],
#' d = 2, hx = 0.75, hy = 0.75, nx = NumNNs_X, ny=NumNNs_Y)
#' cat("The nonparametric canonical correlation between X and Y is ", ncca_res$cor_XY, "\n")
#'
ncca.rsvd <- function(X, # the paried training examples (rows) of features (columns)
Y, # the paried training examples (rows) of features (columns)
XV = NULL, # the paried validation examples
YV = NULL, # the paried validation examples
XT = NULL, # the paried test examples
YT = NULL, # the paried test examples
d, # the dimension of the output transformed features
hx = 1, # bandwith parameters for the KDEs of X
hy = 1, # bandwith parameters for the KDEs of Y
nx = 20, # number of nearest neighbours of X
ny = 20, # number of nearest neighbours of Y, it causes problem when nx != ny
PreComputedNNs = NULL, # only for debugging purposes
verbose = getOption("verbose"),
...){
# Create dataframe for saving times
#time.df <- as.data.frame(matrix(nrow = 1, ncol = 3 ))
#colnames(time.df) <- c('user','system','elapsed')
#t1 <- proc.time()
# ## remove the NAs in the matrices
if(anyNA(X)| anyNA(Y)){
remove.na <- unlist(apply(cbind(X,Y), 1, anyNA))
X <- X[!remove.na, , drop = F]
Y <- Y[!remove.na, , drop = F]
rm(remove.na)
}
if(anyNA(XV)| anyNA(YV)){
remove.na <- unlist(apply(cbind(XV,YV), 1, anyNA))
XV <- XV[!remove.na, , drop = F]
YV <- YV[!remove.na, , drop = F]
rm(remove.na)
}
if(anyNA(XT)| anyNA(YT)){
remove.na <- unlist(apply(cbind(XT,YT), 1, anyNA))
XT <- XT[!remove.na, , drop = F]
YT <- YT[!remove.na, , drop = F]
rm(remove.na)
}
# ## Set default values to the unspecified parameters
N <- nrow(X)
## Normalize data, using only training portion
NXV <- nrow(XV); NXT <- nrow(XT);
X <- rbind(X, XV, XT)
NX <- nrow(X)
rm(XV); rm(XT);
NYV <- nrow(YV); NYT <- nrow(YT);
Y <- rbind(Y, YV, YT)
NY <- nrow(Y)
rm(YV); rm(YT);
colmeansTr <- colMeans(X[1:N, , drop = F])
X <- t(t(X) - colmeansTr )
meanrowsum2Tr <- sqrt(mean(rowSums(X[1:N, , drop = F]^2)))
X <- X/meanrowsum2Tr
Y <- t(t(Y) - colMeans(Y[1:N, , drop = F]))
Y <- Y/sqrt(mean(rowSums(Y[1:N, , drop = F]^2)))
## compute NNs
if(is.null(PreComputedNNs)){
X_NNs <- FNN:::get.knnx(X[1:N, ], X, k = nx)
idxs_X <- t(X_NNs$nn.index)
dists_X <- t(X_NNs$nn.dist)
Y_NNs <- FNN:::get.knnx(Y[1:N, ], Y, k = ny)
idxs_Y <- t(Y_NNs$nn.index)
dists_Y <- t(Y_NNs$nn.dist)
rm(list = c("X_NNs","Y_NNs", "X", "Y"))
}else{
rm(list = c("X", "Y"))
load(PreComputedNNs)
}
gc()
## compute weight matrices for X and Y
colInd <- rep(c(1:NX), each = nx)
Dx <- Matrix::spMatrix(nrow = N, ncol = NX, i = as.vector(idxs_X), j = colInd, x = exp(-0.5*as.numeric(dists_X)/(hx^2)))
Dx <- Matrix::t(Dx)
colInd <- rep(c(1:NY), each = nx)
Dy <- Matrix::spMatrix(nrow = N, ncol = NY, i = as.vector(idxs_Y), j = colInd, x = exp(-0.5*as.numeric(dists_Y)/(hx^2)))
## normalize the weight matrices
Dx <- Matrix::Diagonal(NX, 1/Matrix::rowSums(Dx))%*%Dx
Dy <- Dy%*%Matrix::t(Matrix::Diagonal(NY, 1/Matrix::colSums(Dy)))
## Doubly stochastic normalization
## need some interpretation
S <- Dx %*% Dy
if(verbose) cat("Normalizing S to be doubly stochastic ...\n")
its <- 15 # get error in SVD when the number of iterations increasing
if(verbose) pb <- txtProgressBar(min = 0, max = its, style = 3)
for(i in 1:its){
S <- (NY/N) * Matrix::Diagonal(NY, 1/Matrix::rowSums(S)) %*% S
S <- S %*% Matrix::Diagonal(NX, 1/Matrix::colSums(S)) * (NX/N)
if(verbose) setTxtProgressBar(pb, i)
}
if(verbose) close(pb)
## compute projections using the eigendecomposition of the kernel
if(verbose) cat('Performing exact SVD ...\n')
#S_SVD <- svd(S, d+1, d+1)
#S_SVD <- irlba::irlba(S, d+1,fastpath = F) # a fast svd on a sparse matrix
S_SVD <- rsvd::rsvd(S,d+1)
X_proj <- S_SVD$u[,2:(d+1)] * sqrt(NX)
Y_proj <- S_SVD$v[,2:(d+1)] * sqrt(NY)
## output
X_new <- X_proj[1:N, ]
Y_new <- Y_proj[1:N, ]
cor_XY <- cancor(X_new, Y_new)$cor
#### add p-value (Ilaria)
p_XY <- CCP::p.asym(cor_XY, N, ncol(X_new), ncol(Y_new))
if(is.null(NXV) | is.null(NYV)){
XV_new <- YV_new <- cor_XVYV <- p_XVYV <- NULL
}else{
XV_new <- X_proj[N+(1:NXV), ]
YV_new <- Y_proj[N+(1:NYV), ]
cor_XVYV <- cancor(XV_new, YV_new)$cor
#### add p-value
p_XVYV <- CCP::p.asym(cor_XVYV, NXV, ncol(XV_new), ncol(YV_new))
}
if(is.null(NXT) | is.null(NYT)){
XT_new <- YT_new <- cor_XTYT <- p_XTYT <- NULL
}else{
XT_new <- X_proj[(N+NXV+1):NX, ]
YT_new <- Y_proj[(N+NYV+1):NY, ]
cor_XTYT <- cancor(XT_new, YT_new)$cor
#### add p-value
p_XTYT <- CCP::p.asym(cor_XTYT, NXT, ncol(XT_new), ncol(YT_new))
}
# time.df[1, ]<- (proc.time()- t1)[1:3]
# write.table(time.df,paste0(snp.dir,'/time.single_',svd.alg),append=T)
ncca_out <- list(colmeanX=colmeansTr, meanrowsumX=meanrowsum2Tr, Wx=Dx, Wy=Dy, X_new = X_new, Y_new = Y_new,
XV_new = XV_new, YV_new = YV_new,
XT_new = XT_new, YT_new = YT_new,
cor_XY = cor_XY, cor_XVYV = cor_XVYV, cor_XTYT = cor_XTYT, p_XY=p_XY, p_XVYV=p_XVYV, p_XTYT=p_XTYT)
return(ncca_out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.