Nothing
##################
### Kernel PCA ###
##################
### Kernel PCA and related functions
#' Kernel PCA
#'
#' @description
#'`kPCA()` computes the kernel PCA from a kernel matrix and, if desired, produces
#' a plot. The contribution of the original variables to the Principal Components (PCs),
#' sometimes referred as "loadings", is NOT returned (to do so, go to `kPCA_imp()`).
#'
#' @details As the ordinary PCA, kernel PCA can be used to summarize, visualize and/or
#' create new features of a dataset. Data can be projected in a linear or nonlinear
#' way, depending on the kernel used. When the kernel is `Linear()`, kernel PCA
#' is equivalent to ordinary PCA.
#'
#' @param K Kernel matrix (class "matrix").
#' @param center A logical value. If TRUE, the variables are zero-centered before
#' the PCA. (Defaults: TRUE).
#' @param Ktest (optional) An additional kernel matrix corresponding to test samples,
#' with dimension \emph{Ntest x Ntraining}. These new samples are projected
#' (using the color defined by `na_col`) over the kernel PCA computed from K.
#' Remember than the data that generated `Ktest` should be centered beforehand, using
#' the same values used for centering `K`.
#' @param plot (optional) A `ggplot2` is displayed. The input should be a vector of
#' integers with length 2, corresponding to the two Principal Components to be displayed in the plot.
#' @param y (optional) A factor, or a numeric vector, with length equal to `nrow(K)`
#' (number of samples). This parameter allows to paint the points with different colors.
#' @param colors A single color, or a vector of colors. If `y` is numeric, a gradient of colors
#' between the first and the second entry will be used to paint the points. (Defaults: "black").
#' @param na_col Color of the entries that have a NA in the parameter `y`, or the entries
#' corresponding to `Ktest` (when `Ktest` is not NULL). Otherwise, this parameter is ignored.
#' @param title Plot title.
#' @param pos_leg Position of the legend.
#' @param name_leg Title of the legend. (Defaults: blank)
#' @param labels (optional) A vector of the same length than nrow(K). A name will be
#' displayed next to each point.
#' @param ellipse (optional) A float between 0 and 1. An ellipse will be drawn for
#' each group of points defined by `y`. Here `y` should be of class "factor." This
#' parameter will indicate the spread of the ellipse.
#' @return A list with two objects:
#'
#' * The PCA projection (class "matrix"). Please note that if K was computed from a \emph{NxD}
#' table with \emph{N > D}, only the first \emph{N-D} PCs may be useful.
#'
#' * (optional) A `ggplot2` plot of the selected PCs.
#' @export
#' @import ggplot2
#' @importFrom methods hasArg
#' @importFrom stats qchisq
#' @importFrom methods is
#' @examples
#' dat <- matrix(rnorm(150),ncol=50,nrow=30)
#' K <- Linear(dat)
#'
#' ## Projection's coordinates only:
#' pca <- kPCA(K)
#'
#' ## Coordinates + plot of the two first principal components (PC1 and PC2):
#' pca <- kPCA(K,plot=1:2, colors = "coral2")
#' pca$plot
kPCA <- function(K, center=TRUE, Ktest=NULL, plot=NULL, y=NULL, colors="black", na_col="grey70",
title="Kernel PCA", pos_leg="right",name_leg="",labels=NULL,ellipse=NULL) {
#kPCA
if(center) K <- centerK(K)
spectral_K <- svd(K)
# La projecció és UD
spectral_K$d <- sqrt(spectral_K$d)
projected <- as.data.frame(spectral_K$u %*% diag(spectral_K$d))
rownames(projected) <- rownames(K)
colnames(projected) <- paste0("PC",1:ncol(projected))
#Projecció d'un test (si n'hi ha)
if(hasArg(Ktest)) {
# La predicció d'un nou punt és kUD^(-1) on k=k(x,x_i)
projectest <- Ktest %*% spectral_K$u %*% diag(1/spectral_K$d)
colnames(projectest) <- colnames(projected)
projected <- rbind(projected,projectest)
rownames(projected) <- c(rownames(K),rownames(Ktest))
if(is.null(y)) {
y <- rep(1,nrow(K))
pos_leg <- "none"
}
y2 <- c(y,rep(NA,nrow(Ktest)))
names(y2) <- rownames(projected)
if(methods::is(y,"factor")) y <- factor(y2,levels=levels(factor(y2)),labels=levels(y))
}
#Plot
if(hasArg(plot)) {
i <- plot[1]
j <- plot[2]
spectral_K$d <- spectral_K$d^2 ##calculat com a prcomp
xpercent <- spectral_K$d[i]/sum(spectral_K$d)*100
ypercent <- spectral_K$d[j]/sum(spectral_K$d)*100
q <- ggplot2::ggplot(projected, aes(projected[,i], projected[,j])) + theme_bw() +
ggtitle(title) + xlab(paste0("PC ", i,": ", format(xpercent,digits=3), "%")) +
ylab(paste0("PC ", j,": ", format(ypercent,digits=3), "%")) +
# coord_fixed() +
ggplot2::theme(legend.position = pos_leg,legend.box = "horizontal",plot.title = element_text(hjust = 0.5,face = "bold"))
if(hasArg(y)) {
q <- q + ggplot2::geom_point(ggplot2::aes(colour = y))
if(hasArg(colors)) {
if(methods::is(y,"factor")) {
q <- q + ggplot2::scale_color_manual(name=name_leg,values = colors,na.value = na_col)
} else {
q <- q + ggplot2::scale_colour_gradientn(name=name_leg, colours=colors, na.value = na_col)
}
} else {
q$labels$colour <- name_leg
}
} else if(hasArg(colors)) {
q <- q + ggplot2::geom_point(colour = colors)
} else {
q <- q + ggplot2::geom_point()
}
if(hasArg(labels)) q <- q + ggplot2::geom_text(ggplot2::aes(label=labels,hjust=0.5, vjust=2))
if((is.factor(y)|hasArg(Ktest)) && hasArg(ellipse)) {
if(hasArg(Ktest)) {
y[!is.na(y)] <- "Training"
y[is.na(y)] <- "Test"
y <- as.factor(y)
}
ell <- tapply(data.frame(xvar=projected[,i],yvar=projected[,j],groups=y),y, function(X)ellipse_helper(X,ellipse=ellipse))
ell <- data.frame(do.call(rbind, ell))
colnames(ell)[1:3] <- c("xvar", "yvar","groups")
xvar <- yvar <- groups <- NULL
q <- q + geom_path(data = ell, aes(xvar,yvar,group=groups,color=groups))
# + coord_fixed()
}
return(list(projection=projected,plot=q))
}
return(projected)
}
#' Contributions of the variables to the Principal Components ("loadings")
#'
#' @description
#'`kPCA_imp()` performs a PCA and a kernel PCA simultaneously and returns
#' the contributions of the variables to the Principal Components (sometimes, these
#' contributions are called "loadings") in Feature Space. Optionally, it can also
#' return the samples' projection (cropped to the relevant PCs) and the values used
#' to centering the variables in Feature Space.
#' It does not return any plot, nor it projects test data. To do so, please use `kPCA()`.
#' @details This function may be not valid for all kernels. Do not use it with
#' the RBF, Laplacian, Bray-Curtis, Jaccard/Ruzicka, or Kendall's tau kernels unless
#' you know exactly what you are doing.
#' @param DATA A matrix or data.frame (NOT a kernel matrix) containing the data in
#' feature space. Please note that nrow(DATA) should be higher than ncol(DATA).
#' If the Linear kernel is used, this feature space is simply the original space.
#' @param center A logical value. If TRUE, the variables are zero-centered. (Defaults: TRUE).
#' @param projected (optional) If desired, the PCA projection (generated, for example, by `kPCA()`)
#' can be included. If DATA is big (especially in the number of rows) this may save
#' some computation time.
#' @param secure (optional) If TRUE, it tests the quality of the loadings
#' This may be slow. (Defaults: FALSE).
#' @return A list with three objects:
#'
#' * The PCA projection (class "matrix") using only the relevant Principal Components.
#'
#' * The loadings.
#'
#' * The values used to center each variable in Feature Space.
#' @export
#' @import ggplot2
#' @examples
#' dat <- matrix(rnorm(150),ncol=30,nrow=50)
#' contributions <- kPCA_imp(dat)
#' contributions$loadings[c("PC1","PC2"),1:5]
kPCA_imp <- function(DATA, center=TRUE, projected=NULL,secure=FALSE) {
message("Do not use this function if the PCA was created with the RBF,
Laplacian, Bray-Curtis, Jaccard/Ruzicka, or Kendall's tau kernels")
n <- nrow(DATA)
d <- ncol(DATA)
if(n < d) stop("nrow should be greater than ncol")
# Centrar les dades
if(center) {
X <- scale(DATA,center = TRUE,scale=FALSE)
} else {
X <- DATA
}
# Importàncies
spectral_X <- svd(X)
loading <- spectral_X$v
rownames(loading) <- colnames(DATA) ##equivalent a rownames(loadings( pca))
colnames(loading) <- paste0("PC",1:d) ##equivalent a colnames(loadings( pca))
loading <- t(loading)
#kPCA
if(is.null(projected)) {
projected <- kPCA(K=Linear(X,cos.norm = FALSE),center=center)
} else {
if(ncol(projected)<ncol(X)) message("Ignoring projected data (ncol of projected < ncol of data)")
}
# Comprovació de signes
relneg <- as.data.frame(projected[,1:d] / spectral_X$u %*% diag(spectral_X$d))
## Alguns PC tenen el signe canviat (estan a l'inrevés). Així que canviam es signe dels loadings:
relneg <- which(round(colSums(relneg)) ==-n)
loading[as.numeric(relneg),] <- -loading[as.numeric(relneg),]
if(secure) { ## Mirem que la reconstrucció sigui correcta. Aquest pas pot ser lent
oldvars <- matrix(0,nrow=nrow(DATA),ncol=ncol(loading))
rownames(oldvars) <- rownames(DATA)
colnames(oldvars) <- colnames(loading)
for(i in 1:n) oldvars[i,] <- solve(loading,t(projected)[1:d,i])
oldvars <- oldvars + matrix(attr(X,which = "scaled:center"),nrow=n,ncol=ncol(oldvars),byrow = TRUE)
if(mean(matrix(abs(DATA-oldvars)))>1e-7) warning("Reconstruction may be inacurate")
}
return(list(rotated=projected[,1:d], loadings=loading, center=attr(X,which = "scaled:center")))
}
#' Plot the original variables' contribution to a PCA plot
#'
#' @description
#'`kPCA_arrows()` draws arrows on a (kernel) PCA plot to represent the contribution
#' of the original variables to the two displayed Principal Components (PCs).
#'
#' @details It is important to note that the arrows are scaled to match the samples' projection
#' plot. Thus, arrows' directions are correct, but do not expect that their magnitudes
#' match the output of `kPCA_imp()` or other functions(`prcomp`, `princomp...`).
#' (Nevertheless, they should at least be proportional to the real magnitudes.)
#' @param plot A kernel PCA plot generated by `kPCA()`.
#' @param contributions The variables contributions, for instance obtained via `kPCA_imp()`.
#' It is not mandatory to draw all the original variables; a subset of interest
#' can be passed on to this argument.
#' @param colour Color of arrows and labels. (Defaults: "steelblue").
#' @param size Size of the labels. (Defaults: 4).
#' @param ... Additional parameters passed on to geom_segments() and geom_text().
#' @return The PCA plot with the arrows (`ggplot2` object).
#' @export
#' @import ggplot2
#' @examples
#' dat <- matrix(rnorm(500),ncol=10,nrow=50)
#' K <- Linear(dat)
#'
#' ## Computing the kernel PCA. The plot represents PC1 and PC2:
#' kpca <- kPCA(K,plot=1:2)
#'
#' ## Computing the contributions to all the PCS:
#' pcs <- kPCA_imp(dat,secure=FALSE)
#'
#' ## We will draw the arrows for PC1 and PC2.
#' contributions <- t(pcs$loadings[1:2,])
#' rownames(contributions) <- 1:10
#' kPCA_arrows(plot=kpca$plot,contributions=contributions)
kPCA_arrows <- function(plot, contributions, colour="steelblue", size=4, ...) {
pcs <- plot$data[,as.numeric(sub("PC","",colnames(contributions)))]
ratio <- max(apply(contributions,2,range)/apply(pcs,2,range))
X <- Y <- label <- NULL
colnames(contributions)[1:2] <- c("X","Y")
if(!("label" %in% colnames(contributions))) {
contributions <- data.frame(contributions/ratio,label=rownames(contributions))
}
plot <- plot + ggplot2::geom_segment(data = contributions, aes(x=0, xend=X, y=0, yend=Y),
arrow = arrow(length = unit(0.3, "cm")),colour=colour)
# Add text
plot <- plot + ggplot2::geom_text(data=contributions, aes( x=X, y=Y, label=label), ,
size=size,hjust=0.5, vjust=2,colour=colour )
return(plot)
}
#' Procrustes Analysis
#'
#' Procrustes Analysis compares two PCA/PCoA/MDS/other ordination methods'
#' projections after "removing" the translation, scaling and rotation effects.
#' Thus, they are compared in their configuration of "maximum similarity".
#' Samples in the two projections should be related. The similarity of the projections
#' X1 and X2 is quantified using a correlation-like statistic derived from the
#' symmetric Procrustes sum of squared differences between X1 and X2.
#' @details `Procrustes()` performs a Procrustes Analysis equivalent to
#' `vegan::procrustes(X,Y,scale=FALSE,symmetric=TRUE)`.
#'
#' @param X1 A matrix or data.frame containing a PCA/PCoA/MDS projection.
#' @param X2 A second matrix or data.frame containing a different PCA/PCoA/MDS projection,
#' with the same number of rows than X1.
#' @param labels (optional) A vector of the same length than nrow(X1), or instead,
#' nrow(X1)+nrow(X2). A name will be displayed next to each point.
#' @inheritParams kPCA
#' @return A list containing:
#'
#' * X1 (zero-centered and scaled).
#'
#' * X2 superimposed over X1 (after translating, scaling and rotating X2).
#'
#' * Procrustes correlation between X1 and X2.
#'
#' * (optional) A `ggplot2` plot.
#'
#' @export
#' @import ggplot2
#' @importFrom methods hasArg
#' @examples
#' data1 <- matrix(rnorm(900),ncol=30,nrow=30)
#' data2 <- matrix(rnorm(900),ncol=30,nrow=30)
#' pca1 <- kPCA(Linear(data1),center=TRUE)
#' pca2 <- kPCA(Linear(data2),center=TRUE)
#' procr <- Procrustes(pca1,pca2)
#' # Procrustean correlation between pca1 and pca2:
#' procr$pro.cor
#' # With plot (first two axes):
#' procr <- Procrustes(pca1,pca2,plot=1:2,labels=1:30)
#' procr$plot
Procrustes <- function(X1,X2,plot=NULL,labels=NULL) { ## simètric
if (nrow(X1) != nrow(X2))
stop(gettextf("matrices have different number of rows: %d and %d",
nrow(X1), nrow(X2)))
if (ncol(X1) < ncol(X2)) {
warning("X1 has fewer axes than X2: X1 adjusted to comform X2\n")
addcols <- ncol(X2) - ncol(X1)
for (i in 1:addcols) X1 <- cbind(X1, 0)
} else if(ncol(X1) > ncol(X2)) {
warning("X2 has fewer axes than X1\n")
}
colnms <- colnames(X1)
## Center
X1 <- scale(X1,scale=FALSE)
X2 <- scale(X2,scale=FALSE)
## Scale
X1 <- frobNorm(as.matrix(X1))
X2 <- frobNorm(as.matrix(X2))
# Rotate
desc <- svd(crossprod(X1,X2))
Q <- tcrossprod(desc$v, desc$u)
X2_rot <- X2%*%Q
colnames(X2_rot) <- colnms
rownames(X2_rot) <- rownames(X1)
if(hasArg(plot)) {
i <- plot[1]
j <- plot[2]
colors <- factor(paste("Table",c(rep(1,nrow(X1)),rep(2,nrow(X2)))))
X <- data.frame(rbind(X1,X2_rot))
q <- ggplot(X, aes(X[,i], X[,j])) + theme_bw() +
ggtitle("Procrustes") + xlab(paste0("Dimension", i)) +
ylab(paste0("Dimension", j)) + coord_fixed() + ggplot2::geom_point(ggplot2::aes(colour = colors))
if(hasArg(labels)) {
if(length(labels)==nrow(X1)) {
labels <- c(labels,labels)
} else if(length(labels)!=(2*nrow(X1))) {
stop("label length does not match the dimension of X,Y")
}
q <- q + geom_text(ggplot2::aes(label=labels,hjust=0, vjust=0))
}
return(list(X1=X1, X2_rot=X2_rot, pro.cor= sqrt(1-tr( crossprod( X1-(X2_rot),X1-(X2_rot) ) )) ,plot=q))
} else {
return(list(X1=X1, X2_rot=X2_rot, pro.cor= sqrt(1-(tr(crossprod( X1-(X2_rot),X1-(X2_rot)))) )))
}
}
## Helpers
# Ellipse helper kPCA()
#' @keywords internal
#' @noRd
ellipse_helper <- function(X,ellipse) {
if (nrow(X) <= 2) return(NULL)
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- cbind(cos(theta), sin(theta))
sigma <- var(cbind(X$xvar, X$yvar))
mu <- c(mean(X$xvar), mean(X$yvar))
ed <- sqrt(stats::qchisq(ellipse, df = 2))
data.frame(sweep(circle %*% chol(sigma) * ed, 2,
mu, FUN = "+"), groups =X$groups[1])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.