#' @title mean.shift
#'
#' @name mean.shift
#'
#' @param vec numeric vector passed
#'
#' @return returns data mean centered
mean.shift <- function(vec) {
x.bar <- mean(vec)
vec.return <- vec - x.bar
return(vec.return)
}
#' @title mean.shift.inv
#'
#' @name mean.shift.inv
#'
#' @param vec numeric vector passed
#'
#' @return returns data invereted mean centered
mean.shift.inv <- function(vec) {
x.bar <- mean(vec)
vec.return <- x.bar - vec
return(vec.return)
}
#' @title just.pca
#'
#' @description Takes Tidy data and Generates pca.object that retains
#' information of original data.
#'
#' @name just.pca
#'
#' @param tidydata Tidy data input
#' @param y column name(s) or index(es) of columns whose interaction is used
#' as y component of formula in dcast's formula = y~x. y or the interaction
#' of columns passed to y becomes the 'individuals' of the pca. This interaction
#' forms an 'id.col' that is added to the original data frame and used to join
#' pca components and the original data frame.
#' @param x column name(s) or index(es) of columns whose interaction is used
#' as y component of formula in dcast's formula = y~x. x or the interaction
#' of columns passed to x becomes the 'variables' of the pca.
#' @param value.var Character string of name of column which stores values.
#' @param scale Logical, scales the variables to have the same unit variance.
#' Default set to FALSE.
#' @param num.degrees Integer that determines the number of principal components
#' to join to the orginal data frame. Default set to 5.
#' @param invert Logical, determines if mean centering should be inverted or not.
#' Default set to FALSE.
#'
#' @return returns pca object that is a list of "data","loadings","values",
#' "covariance","contribution"
#'
#' @export
just.pca <- function(tidydata, y,x,value.var,scale = FALSE, num.degrees = 5, invert = FALSE) {
#Most of this code is recreating popular PCA functions to understand what is happening under the hood
tidycoln <- colnames(tidydata)
y.index <- index.o.coln(vec = y, v.size = length(y), v.name = "y", name.col = tidycoln)
x.index <- index.o.coln(vec = x, v.size = length(x), v.name = "x", name.col = tidycoln)
val.index <- index.o.coln(vec = value.var, v.size = 1, v.name = "value.var", name.col = tidycoln)
if(length(y.index)==1) {
colnames(tidydata)[y.index] <- "id.col"
} else {
tidydata$id.col <- interaction(tidydata[,y.index])
y.index <- ncol(tidydata)
}
if(length(x.index)==1) {
colnames(tidydata)[x.index] <- "variable.col"
} else {
tidydata$variable.col <- interaction(tidydata[,x.index])
x.index <- ncol(tidydata)
}
tidydata <- as.data.frame(tidydata)
tidycoln <- colnames(tidydata)
tidy.input <- unique(tidydata[,c(y.index,x.index, val.index)])
matrix <- dcast(data = tidy.input,formula = id.col~variable.col, value.var = value.var )
tidydata <- tidydata[,-c(x.index, val.index)]
tidydata <- as.data.frame(unique(tidydata))
colnames(tidydata) <- tidycoln[-c(x.index,val.index)]
#other.index <- index.o.coln(vec = Column.ex, v.size = length(Column.ex), v.name = "Column.ex",name.col = colnames(matrix))
#other.columns <- tidydata[,]
#matrix.df <- tidydata[,-(index)]
if(length(unique(rownames(matrix)))!=nrow(matrix)){
stop("row names are not unique!")
}
rownames(matrix) <- matrix$id.col
matrix <- matrix[,-1]
if(invert==T) {
matrix.df.work <- apply(X = matrix, MARGIN = 2,FUN = mean.shift.inv)
} else{
matrix.df.work <- apply(X = matrix, MARGIN = 2,FUN = mean.shift)
}
#testing scale
if(scale==TRUE){
rnames <- rownames(matrix.df.work)
matrix.df.work <- apply(X = matrix.df.work, MARGIN = 2, FUN = scale)
rownames(matrix.df.work) <- rnames
}
#covariance
my.cov = cov(matrix.df.work)
points.names <- rownames(my.cov)
#eigen values
my.eig = eigen(my.cov)
n <- ncol(my.eig[["vectors"]])
PC <- vector(mode = "character", n)
for(i in 1:n) {
PC[i] <- paste("PC",i,sep = "")
}
colnames(my.eig$vectors) <- PC
rownames(my.eig$vectors) <- points.names
loadings <- my.eig$vectors
values <- my.eig$values
new <- matrix.df.work%*%loadings
new <- new[,-c((num.degrees+1):n)]
new <- as.data.frame(new) %>%
mutate(id.col=rownames(new))
df <- inner_join(tidydata, new, by = c("id.col"))
contrib <- loadings*loadings
object <- list(df, loadings, values, my.cov, contrib)
names(object) <- c("data","loadings","values","covariance","contribution")
return(object)
}
#' @title plotContrib
#'
#' @description plots the contribution of variables in pca in ggplot
#'
#' @name plotContrib
#'
#' @param pca.obj Takes pca object made by just.pca()
#' @param PC Integer input of which PC dimension should be plotted
#' @param first Integer input. Returns the first 10 most contributing
#' variables by default.
#'
#' @return ggplot bar plot of the the most contributing variables to
#' a PC dimension.
#'
#' @export
plotContrib <- function(pca.obj, PC, first = 10) {
contrib <- pca.obj$contribution
contrib <- as.data.frame(contrib[,PC])
colnames(contrib) <- "Contribution"
contrib$names <- rownames(contrib)
contrib <- contrib[order(contrib$Contribution),]
order.names <- contrib[order(contrib$Contribution, decreasing = T),]$names
contrib$names <- factor(contrib$names, levels = order.names)
order.names <- order.names[c(1:first)]
contrib <- contrib[contrib$names %in% order.names,]
contrib[,1] <- contrib[,1]*100
# contrib <- count.to.frequency(df = contrib, count = 1)
#contrib <- droplevels(contrib)
p <- ggplot(contrib, aes(x = names, y = Contribution)) +
geom_bar(stat = "identity", fill = "blue") +
labs(list(y = "Contribution (%)", x = "Variables")) +
theme_classic() +
theme(axis.text.x = element_text(angle = -90, hjust =0, vjust = .5, size = 8))
return(p)
}
#' @title neg.square
#'
#' @name neg.square
#'
#' @param x numeric vector passed
#'
#' @return returns the squares of x, but retains negative sign
neg.square <- function(x) {
neg <- ifelse(x<0, -1, 1)
x <- x*x
x <- x*neg
return(x)
}
#' @title scale.abs.max
#'
#' @name scale.abs.max
#'
#' @param x numeric vector passed
#'
#' @return returns x scaled by the absolute max
scale.abs.max <- function(x) {
m <- max(abs(x))
x <- x/(m)
return(x)
}
#' @title circleFun
#'
#' @name circleFun
#'
#' @param center coordinate points for the center of circle
#' @param diameter diameter length of circle, 2r
#' @param npoints number of points to estimate
#'
#' @source https://stackoverflow.com/questions/6862742/draw-a-circle-with-ggplot2/6863490#6863490 from Joran
#'
#' @return returns data frame to draw circle points.
circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
#' @title to.unit
#'
#' @description Takes cartesian coordiants and returns the unit circle
#' points as x, y coordinates.
#'
#' @name to.unit
#'
#' @param x numeric vector passed of length 2
#'
#' @return returns x y cordinates in polar form
#'
#' @export
to.unit <- function(x) {
neg <- FALSE
if(length(x)!=2) {
stop()
}
if(x[1]<0 && x[1]!=0) {
neg <- TRUE
}
if(sum(abs(x))==0) {
x <- c(0,0)
return(x)
} else {
x.hat <- cos((atan(x[2]/x[1])))
y.hat <- sin((atan(x[2]/x[1])))
x[1] <- x.hat
x[2] <- y.hat
if(neg==TRUE) {
x <- x*(-1)
}
return(x)
}
}
#' @title pca.biplot
#'
#' @description Returns biplot of two pc dimensions
#'
#' @name pca.biplot
#'
#' @param pca.obj Takes pca object made by just.pca()
#' @param pcs Integer input vector of length 2 of which PC dimensions
#' should be plotted
#' @param flip Logical, Default set to FALSE. will flip coordinates
#' over x=y line. Only included because PCA function's biplot seems to
#' always be flipped.
#' @param scale Logical, Default set to TRUE. Will scale vectors to unit
#' circle. If set to FALSE, will use eigenvalues as relative length. Scale
#' set to FALSE will also remove regular coordinates and plotted circle.
#'
#' @return ggplot plot of the variables in a coordinate graph of two
#' pc dimensions.
#'
#' @export
pca.biplot <- function(pca.obj, pcs, flip = FALSE, scale = T) {
index <- index.o.coln(vec = pcs, v.size = 2, v.name = "pcs",name.col = colnames(pca.objec$loadings))
weights <- pca.obj$loadings[,index]
unit.dir <- t(apply(weights, MARGIN = 1, to.unit))
weights[,1] <- weights[,1]*sqrt(pca.obj$values[index[1]])
weights[,2] <- weights[,2]*sqrt(pca.obj$values[index[2]])
sq <- sqrt((weights[,1]^2)+(weights[,2]^2))
percent.var <- round((pca.obj$values/(sum(pca.obj$values)))*100,digits = 1)
#squares <- neg.square(weights)
if(scale==T) {
scale.sq <- sq/(max(sq))
#weights <- apply(weights, MARGIN = 2, scale.abs.max)
weights <- unit.dir*scale.sq
} else {
# unit.dir[,1] <- unit.dir[,1]*sqrt(pca.obj$values[index[1]])
# unit.dir[,2] <- unit.dir[,2]*sqrt(pca.obj$values[index[2]])
weights <- unit.dir*sq
}
col.temp <- colnames(weights)
colnames(weights) <- c("V1","V2")
weights <- as.data.frame(weights)
if(flip==TRUE){
weights <- weights*-1
}
weights$names <- rownames(weights)
#library(ggrepel)
p <- ggplot(weights, aes(x = V1, y = V2,label=names)) +
geom_point(position = position_jitter()) +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_segment(mapping=aes(x=0,
y=0,
xend=V1,
yend=V2),
arrow=arrow(angle = 10,
length = unit(.05,
units = "npc")),
size=.1, color="blue")
if(scale==T) {
p <- p + coord_cartesian(xlim = c(-1,1),
ylim = c(-1,1)) +
geom_polygon(aes(x, y),
data = circleFun(center = c(0,0),
diameter = 2,
npoints = 100),
colour = "black",
fill = NA,
inherit.aes = F)
}
p <- p +
geom_label_repel()+
theme_classic() +
labs(list(x = paste(col.temp[1],
" (",
percent.var[index[1]],
" %)",
sep = ""),
y = paste(col.temp[2],
" (",
percent.var[index[2]],
" %)",
sep = "")))
return(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.