#' integrated do_svd
#' @export
do_svd <- function(df, ..., skv = NULL, fun.aggregate=mean, fill=0){
validate_empty_data(df)
if (!is.null(skv)) {
#.kv pattern
if (!length(skv) %in% c(2, 3)) {
stop("length of skv has to be 2 or 3")
}
value <- if(length(skv) == 2) NULL else skv[[3]]
do_svd.kv_(df, skv[[1]], skv[[2]], value, fun.aggregate = fun.aggregate, fill = fill, ...)
} else {
#.cols pattern
do_svd.cols(df, ...)
}
}
#' Non Standard Evaluation version of do_svd.kv_
#' @export
do_svd.kv <- function(df, subject, key, value = NULL, ...){
subject_col <- col_name(substitute(subject))
key_col <- col_name(substitute(key))
if(!is.null(substitute(value))){
value_col <- col_name(substitute(value))
} else {
value_col <- NULL
}
do_svd.kv_(df, subject_col, key_col, value_col, ...)
}
#' Calculate svd from tidy format. This can be used to calculate coordinations by reducing dimensionality.
#' @param df Data frame which has group and dimension
#' @param subject_col A column name to be regarded as groups
#' @param key_col A column name to be regarded as original dimensions
#' @param value_col A column name to be regarded as values
#' @param type "group" to see the coordinations of groups in reduced dimension.
#' "dimension" to see the direction of new axes in original dimension.
#' "variance" to see how much the data is distributed in the direction of new axes.
#' @param fill Value to fill where value doesn't exist.
#' @param fun.aggregate Function to aggregate values in duplicated pairs of subject and key.
#' @param n_component Number of dimensions to return.
#' @param centering Move the origin to center of data (Mean of each column)
#' Taken from http://genomicsclass.github.io/book/pages/pca_svd.html
#' @param output "long" or "wide".
#' "long" is a format with 3 columns which represents rows, columns and values
#' "wide" is a format which spreads the long information into matrix
#' @return Tidy format of svd result in a data frame.
#' @export
do_svd.kv_ <- function(df,
subject_col,
key_col,
value_col = NULL,
type="group",
fill=0,
fun.aggregate=mean,
n_component=3,
centering=TRUE,
output ="long"){
validate_empty_data(df)
loadNamespace("dplyr")
loadNamespace("tibble")
loadNamespace("tidyr")
dimension_col <- key_col
grouped_col <- grouped_by(df)
if(subject_col %in% grouped_col){
stop(paste0(subject_col, " is a grouping column. ungroup() may be necessary before this operation."))
}
axis_prefix <- "axis"
value_cname <- avoid_conflict(colnames(df), "value")
# this is executed on each group
do_svd_each <- function(df){
# value_col might be NULL
# when values in matrix
# should be count of key and value pairs,
# so it should be checked
if (!is.null(value_col)) {
# remove NA from the value column
df <- df[!is.na(df[[value_col]]), ]
}
matrix <- simple_cast(
df,
subject_col,
dimension_col,
value_col,
fun.aggregate = fun.aggregate,
fill=fill,
na.rm = TRUE
)
# n_component must be smaller than
# or equal to ncol(matrix).
# Otherwise, svd takes to much memory and R session
# crushes
n_component <- min(c(ncol(matrix), n_component))
# this might happen if fill argument is NA or fun.aggregate returns NA
if(any(is.na(matrix))){
stop("NA is not supported as value")
}
if(centering){
# Move the origin to center of data
# (Mean of each column)
#
# Explanation from http://genomicsclass.github.io/book/pages/pca_svd.html
# "The second argument specifies we want to operate on the columns
# (1 would be used for rows), and the third and fourth arguments
# specify that we want to subtract the column means."
matrix <- sweep(matrix, 2, colMeans(matrix), "-")
}
if(type=="group"){
# This caliculates coordinations of groups (rows of matrix)
# in reduced dimension
# and create columns with the information
# u matrix in svd can be regarded as coordinations of groups in reduced dimension
result <- svd(matrix, nu=n_component, nv=0)
mat <- result$u
if (output=="wide") {
# This returns a data frame
# whose columns are coordinations
# in new dimension.
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(c(grouped_col, subject_col), paste("axis", seq(ncol(mat)), sep=""))
# subject column might be non-character type,
# but rownames are always character,
# so its type should be reverted to the same type
# with subject column
rnames <- to_same_type(rownames(matrix), df[[subject_col]])
# create a column for subject values
df <- setNames(data.frame(rnames, stringsAsFactors = FALSE), subject_col)
ret <- cbind(df, ret)
} else if (output=="long") {
# this returns molten format data frame of matrix
# with subjects, new dimentions and values
cnames <- avoid_conflict(grouped_col, c(subject_col, "new.dimension", value_cname))
rownames(mat) <- rownames(matrix)
ret <- mat_to_df(mat, cnames)
} else {
stop(paste(output, "is not supported as output"))
}
} else if (type=="dimension") {
# This caliculates direction of
# base vectors in new dimension
# in original dimension,
# which is v in svd.
# v matrix in svd can be regarded as the direction of new axes in original dimension.
result <- svd(matrix, nv=n_component, nu=0)
mat <- result$v
rownames(mat) <- colnames(matrix)
if (output=="wide") {
# a column is a base vector of
# a new axis of reduced dimension
# in the original dimension
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(c(grouped_col, dimension_col), paste("axis", seq(ncol(mat)), sep=""))
rnames <- to_same_type(rownames(mat), df[[dimension_col]])
df <- setNames(data.frame(rnames, stringsAsFactors = FALSE), dimension_col)
ret <- cbind(df, ret)
} else if (output=="long") {
# this returns molten format data frame of matrix
# with key, new dimentions and values
cnames <- avoid_conflict(grouped_col, c(dimension_col, "new.dimension", value_cname))
ret <- mat_to_df(mat, cnames)
} else {
stop(paste(output, "is not supported as output"))
}
} else if (type=="variance"){
# This returns d in svd which can be
# regarded as how much the data is
# distributed in the direction of new axes.
variance <- svd(matrix, nu=0, nv=0)$d
component <- seq(min(length(variance), n_component))
if (output=="wide") {
# this return one row data frame with d of svd
mat <- matrix(variance[component], ncol=length(component))
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(c(grouped_col), paste("axis", seq(ncol(mat)), sep=""))
} else if (output=="long") {
# this return a data frame with a column with d of svd
ret <- data.frame(component = component, svd.value = variance[component])
colnames(ret) <- avoid_conflict(grouped_col, c("new.dimension", value_cname))
} else {
stop(paste(output, "is not supported as output"))
}
} else {
stop(paste(type, "is not supported as type argument."))
}
ret
}
# Calculation is executed in each group.
# Storing the result in this tmp_col and
# unnesting the result.
# If the original data frame is grouped by "tmp",
# overwriting it should be avoided,
# so avoid_conflict is used here.
tmp_col <- avoid_conflict(grouped_col, "tmp")
ret <- df %>%
dplyr::do_(.dots=setNames(list(~do_svd_each(.)), tmp_col)) %>%
dplyr::ungroup() %>%
unnest_with_drop(!!rlang::sym(tmp_col))
ret
}
#' Calculate distance of each pair of groups.
#' @param df data frame in tidy format
#' @param ... Arguments to select columns to calculate svd.
#' @param type "group" to see the coordinations of groups in reduced dimension.
#' "dimension" to see the direction of new axes in original dimension.
#' "variance" to see how much the data is distributed in the direction of new axes.
#' @param fill Value to fill where value doesn't exist.
#' @param fun.aggregate Function to aggregate values in duplicated pairs of subject and key.
#' @param n_component Number of dimensions to return.
#' @param centering Move the origin to center of data (Mean of each column)
#' Taken from http://genomicsclass.github.io/book/pages/pca_svd.html
#' @param output "long" or "wide".
#' "long" is a format with 3 columns which represents rows, columns and values
#' "wide" is a format which spreads the long information into matrix
#' @param keep_cols If TRUE, selected columns are kept in the result.
#' @return Tidy format of svd result in a data frame.
#' @export
do_svd.cols <- function(df,
...,
type="group",
n_component=2,
centering=TRUE,
output ="wide",
keep_cols = FALSE){
validate_empty_data(df)
loadNamespace("dplyr")
loadNamespace("tidyr")
loadNamespace("reshape2")
loadNamespace("stats")
loadNamespace("lazyeval")
grouped_col <- grouped_by(df)
label_col <- col_name(substitute(label))
# using the new way of NSE column selection evaluation
# ref: https://github.com/tidyverse/tidyr/blob/3b0f946d507f53afb86ea625149bbee3a00c83f6/R/spread.R
select_dots <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
value_colname <- avoid_conflict(grouped_col, "value")
# this is executed on each group
do_svd_each <- function(df){
# create matrix by selected columns
selected_df <- df %>%
dplyr::select(!!!select_dots)
matrix <- selected_df %>%
as.matrix() %>%
na.omit() # this removes rows which have any NA
# n_component must be smaller than
# or equal to ncol(matrix).
# Otherwise, svd takes too much memory and R session
# crushes
n_component <- min(c(ncol(matrix), n_component))
if(centering){
# Move the origin to center of data
# (Mean of each column)
#
# Explanation from http://genomicsclass.github.io/book/pages/pca_svd.html
# "The second argument specifies we want to operate on the columns
# (1 would be used for rows), and the third and fourth arguments
# specify that we want to subtract the column means."
matrix <- sweep(matrix, 2, colMeans(matrix), "-")
}
if(type=="group"){
# This caliculates coordinations of groups (rows of matrix)
# in reduced dimension
# and create columns with the information
# u matrix in svd can be regarded as coordinations of groups in reduced dimension
result <- svd(matrix, nu=n_component, nv=0)
mat <- result$u
if (output=="wide") {
# This returns a data frame
# whose columns are coordinations
# in new dimension
# while keeping columns
# from the original data frame.
# get row indice that had NA
na_indice <- na.action(matrix)
# Rows which have any NA value are removed from matrix to calculate svd
# but the result will be binded to original data frame
# so rows with all NA should be inserted to the removed rows
mat <- fill_mat_NA(setdiff(seq(nrow(df)), na_indice), mat = mat, max_index = nrow(df))
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(c(colnames(df), grouped_col), paste0("axis", seq(ncol(ret))))
# remove selected columns
not_selected_df <- df[,!colnames(df) %in% colnames(selected_df)]
# append u matrix to the original data frame
ret <- if(keep_cols){
# keep selected columns in the end
# if keep_cols is TRUE
cbind(not_selected_df, ret, selected_df)
} else {
cbind(not_selected_df, ret)
}
} else if (output=="long") {
# this returns molten format data frame of matrix
# with subjects, new dimentions and values
cnames <- avoid_conflict(grouped_col, c("row", "new.dimension", value_colname))
ret <- mat_to_df(mat, cnames)
} else {
stop(paste(output, "is not supported as output"))
}
} else if (type=="dimension") {
# This caliculates direction of
# base vectors in new dimension
# in original dimension,
# which is v in svd.
# v matrix in svd can be regarded as the direction of new axes in original dimension.
result <- svd(matrix, nv=n_component, nu=0)
mat <- result$v
rownames(mat) <- colnames(matrix)
if (output=="wide") {
# This returns a data frame
# whose columns are base vectors of
# new axes of reduced dimension
# in the original dimension
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(c(grouped_col), paste("axis", seq(ncol(mat)), sep=""))
rnames <- colnames(matrix)
df <- setNames(data.frame(rnames, stringsAsFactors = FALSE), "colname")
ret <- cbind(df, ret)
} else if (output=="long") {
# this returns molten format data frame
# of v matrix
# with key, new dimentions and values
cnames <- avoid_conflict(grouped_col, c("colname", "new.dimension", value_colname))
ret <- mat_to_df(mat, cnames)
} else {
stop(paste(output, "is not supported as output"))
}
} else if (type=="variance"){
# This returns d in svd which can be
# regarded as how much the data is
# distributed in the direction of new axes.
variance <- svd(matrix, nu=0, nv=0)$d
component <- seq(min(length(variance), n_component))
if (output=="wide") {
# this return one row data frame with d of svd
mat <- matrix(variance[component], ncol=length(component))
ret <- as.data.frame(mat)
colnames(ret) <- avoid_conflict(grouped_col, paste("axis", seq(ncol(mat)), sep=""))
} else if (output=="long") {
# this return a data frame with a column with d of svd
ret <- data.frame(component = component, svd.value = variance[component])
colnames(ret) <- avoid_conflict(grouped_col, c("new.dimension", value_colname))
} else {
stop(paste(output, "is not supported as output"))
}
} else {
stop(paste(type, "is not supported as type argument."))
}
ret
}
# Calculation is executed in each group.
# Storing the result in this tmp_col and
# unnesting the result.
# If the original data frame is grouped by "tmp",
# overwriting it should be avoided,
# so avoid_conflict is used here.
tmp_col <- avoid_conflict(grouped_col, "tmp")
ret <- df %>%
dplyr::do_(.dots=setNames(list(~do_svd_each(.)), tmp_col)) %>%
dplyr::ungroup() %>%
unnest_with_drop(!!rlang::sym(tmp_col))
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.