#'
#' scale() wrapper that returns a plain numeric vector instead of a 1-column matrix.
#' @param x Numeric vector to normalize.
#' @param center Passed to scale(); subtract the mean when TRUE.
#' @param scale Passed to scale(); divide by the standard deviation (or RMS when
#'   center = FALSE) when TRUE.
#' @return Numeric vector of the same length as x.
#' @export
normalize <- function(x, center = TRUE, scale = TRUE) {
  # isTRUE() guards against x[1] being NA (or x being empty): the bare
  # comparison used to make this if() condition NA and error out whenever
  # center = FALSE and the first value was missing.
  if (scale && (center || isTRUE(x[1] == 0)) &&
      min(x, na.rm = TRUE) == max(x, na.rm = TRUE)) {
    # If input is constant and scaling is on, and centering is on or values in the first place were 0,
    # return vector of zeros.
    # scale() does not behave well here, like returning NaN or numbers very close to 0.
    # We handle this case ourselves.
    rep(0, length(x))
  }
  else {
    ret <- scale(x, center = center, scale = scale)
    # scale returns a matrix even if the input is a vector.
    # Convert from a matrix to a numeric vector.
    as.numeric(ret)
  }
}
#' Integrated entry point for correlation calculation.
#' Dispatches to do_cor.kv_ when skv (subject/key/value column names) is given,
#' otherwise to do_cor.cols with column-selection arguments in ...
#' @export
do_cor <- function(df, ..., skv = NULL, fun.aggregate=mean, fill=0){
  validate_empty_data(df)
  if (is.null(skv)) {
    # .cols pattern: correlate the selected columns directly.
    do_cor.cols(df, ...)
  } else {
    # .kv pattern: skv is c(subject, key) or c(subject, key, value).
    if (!(length(skv) %in% c(2, 3))) {
      stop("length of skv has to be 2 or 3")
    }
    value_col <- if (length(skv) == 3) skv[[3]] else NULL
    do_cor.kv_(df, skv[[1]], skv[[2]], value_col,
               fun.aggregate = fun.aggregate, fill = fill, ...)
  }
}
#'
#' Calculate correlation among groups and output the correlation of each pair
#' @param df data frame in tidy format
#' @param subject A column you want to calculate the correlations for.
#' @param key A column you want to use as a dimension to calculate the correlations.
#' @param value A column for the values you want to use to calculate the correlations.
#'   Optional; NULL is passed through to do_cor.kv_.
#' @param ... Additional arguments forwarded to do_cor.kv_, such as use
#'   ("everything", "all.obs", "complete.obs", "na.or.complete", or
#'   "pairwise.complete.obs"), method ("pearson", "kendall", or "spearman"),
#'   fun.aggregate, time_unit, distinct, diag, fill, and return_type.
#' @return correlations between pairs of groups
#' @export
do_cor.kv <- function(df,
                      subject,
                      key,
                      value = NULL,
                      ...)
{
  loadNamespace("reshape2")
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  loadNamespace("lazyeval")
  # Resolve the NSE (unquoted) column arguments into character column names.
  # key becomes the matrix row (observation) and subject the column (variable).
  row <- col_name(substitute(key))
  col <- col_name(substitute(subject))
  val <- if(is.null(substitute(value))) NULL else col_name(substitute(value))
  # Delegate to the standard-evaluation implementation.
  do_cor.kv_(df, col, row, val, ...)
}
#' SE (standard evaluation) version of do_cor.kv.
#' Correlates every pair of values found in subject_col, treating key_col
#' values as observations and value_col as the measure.
#' @param df Data frame in tidy format. May be grouped; the calculation runs per group.
#' @param subject_col Name of the column whose values become the variables to correlate.
#' @param key_col Name of the column used as the observation dimension.
#' @param value_col Name of the value column. NULL is passed through to
#'   simple_cast. # NOTE(review): confirm simple_cast's behavior for NULL value.
#' @param time_unit Unit of time to aggregate key_col if key_col is Date or POSIXct. NULL doesn't aggregate.
#' @param use Missing-value handling, passed to cor().
#' @param method Correlation method, passed to cor()/cor.test().
#' @param distinct If TRUE, keep only one row of each symmetric pair.
#' @param diag If TRUE, keep the self-correlation (diagonal) pairs.
#' @param fill Value for missing cells when casting to a matrix.
#' @param fun.aggregate Aggregate function for duplicated subject/key combinations.
#' @param return_type "data.frame" for a flat result; otherwise a
#'   cor_exploratory model (correlation data frame plus original data) per group.
#' @export
do_cor.kv_ <- function(df,
                       subject_col,
                       key_col,
                       value_col = NULL,
                       time_unit = NULL,
                       use="pairwise.complete.obs",
                       method="pearson",
                       distinct = FALSE,
                       diag = FALSE,
                       fill = 0,
                       fun.aggregate = mean,
                       return_type = "data.frame"
                       )
{
  validate_empty_data(df)
  loadNamespace("reshape2")
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  loadNamespace("lazyeval")
  # In the cast matrix, rows are observations (key) and columns are variables (subject).
  row <- key_col
  col <- subject_col
  val <- value_col
  grouped_col <- grouped_by(df)
  if(col %in% grouped_col){
    stop(paste0(col, " is a grouping column. ungroup() may be necessary before this operation."))
  }
  # column names are "{subject}.x", "{subject}.y", "value"
  output_cols <- avoid_conflict(grouped_col,
                                c("pair.name.x", "pair.name.y",
                                  "correlation", "p_value", "statistic"))
  do_cor_each <- function(df){
    # Cast the tidy data into a key (rows) x subject (columns) matrix of values.
    mat <- simple_cast(
      df,
      row,
      col,
      val,
      fun.aggregate=fun.aggregate,
      fill=fill,
      time_unit = time_unit,
      na.rm = TRUE
    )
    if (dim(mat)[[1]] < 2) {
      # Correlation require 2 or more rows.
      if (length(grouped_col) > 0) {
        # Skip this group in this case. TODO: Report error.
        return(NULL)
      }
      else {
        # This is the only group. Throw error.
        stop("More than 1 aggregated measures per category are required to calculate correlations.")
      }
    }
    ret <- do_cor_internal(mat, use, method, diag, output_cols, na.rm=FALSE) # TODO: Why was na.rm explicitly set to TRUE for do_cor.kv_ but not for do_cor.cols?
    # Set factor levels to pair.name.x and pair.name.y based on the mean of correlations with other columns.
    cor0 <- ret %>% dplyr::filter(pair.name.x != pair.name.y)
    cor0 <- cor0 %>% dplyr::group_by(pair.name.x) %>% dplyr::summarize(mean_cor=mean(correlation, na.rm=TRUE)) %>% dplyr::arrange(desc(mean_cor))
    ret <- ret %>% dplyr::mutate(pair.name.x = forcats::fct_relevel(pair.name.x, cor0$pair.name.x), pair.name.y = forcats::fct_relevel(pair.name.y, cor0$pair.name.x))
    if (distinct) {
      # Keep only one of each symmetric pair, using the factor level order set above.
      ret <- ret %>% dplyr::filter(as.integer(pair.name.x) <= as.integer(pair.name.y))
    }
    if (return_type == "data.frame") {
      # Sort the data for step output to look better organized on the table view.
      ret <- ret %>% dplyr::arrange(pair.name.x, pair.name.y)
      # Revert the variable names to character for step output.
      ret <- ret %>% dplyr::mutate(pair.name.x = as.character(pair.name.x), pair.name.y = as.character(pair.name.y))
      # We use paste0 since str_c garbles multibyte column names here for some reason.
      ret <- ret %>% dplyr::rename(!!rlang::sym(paste0(col, ".x")):=pair.name.x, !!rlang::sym(paste0(col, ".y")):=pair.name.y)
      ret # Return correlation data frame as is.
    }
    else {
      # Return cor_exploratory model, which is a set of correlation data frame and the original data.
      # We use the original data for scatter matrix on Analytics View.
      # We use paste0 since str_c garbles multibyte column names here for some reason.
      ret <- ret %>% dplyr::rename(!!rlang::sym(paste0(col, ".x")):=pair.name.x, !!rlang::sym(paste0(col, ".y")):=pair.name.y)
      ret <- list(cor = ret, data = df)
      class(ret) <- c("cor_exploratory", class(ret))
      ret
    }
  }
  # Calculation is executed in each group.
  # Storing the result in this tmp_col and
  # unnesting the result.
  # If the original data frame is grouped by "tmp",
  # overwriting it should be avoided,
  # so avoid_conflict is used here.
  if (return_type == "data.frame") {
    tmp_col <- avoid_conflict(grouped_col, "tmp")
    df %>%
      dplyr::do_(.dots=setNames(list(~do_cor_each(.)), tmp_col)) %>%
      dplyr::ungroup() %>%
      unnest_with_drop(!!rlang::sym(tmp_col))
  }
  else {
    # Model output: keep one cor_exploratory model per group, without unnesting.
    do_on_each_group(df, do_cor_each, name = "model", with_unnest = FALSE)
  }
}
#'
#' Calculate correlation among columns and output the correlation of each pair
#' @param df data frame in tidy format
#' @param ... Arguments to select columns to calculate correlation.
#' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs"
#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman".
#' @param distinct If TRUE, keep only one row of each symmetric pair.
#' @param diag If TRUE, keep the self-correlation (diagonal) pairs.
#' @param variable_order "correlation" orders variables by their mean correlation
#'   with the others; any other value ("input") honors the selection order.
#' @param return_type "data.frame" for a flat result; otherwise a
#'   cor_exploratory model (correlation data frame plus original data) per group.
#' @return correlations between pairs of columns
#' @export
do_cor.cols <- function(df, ..., use = "pairwise.complete.obs", method = "pearson",
                        distinct = FALSE, diag = FALSE, variable_order = "correlation",
                        return_type = "data.frame") {
  validate_empty_data(df)
  loadNamespace("dplyr")
  loadNamespace("lazyeval")
  loadNamespace("tibble")
  # select columns using dplyr::select logic
  # using the new way of NSE column selection evaluation
  # ref: https://github.com/tidyverse/tidyr/blob/3b0f946d507f53afb86ea625149bbee3a00c83f6/R/spread.R
  select_dots <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
  grouped_col <- grouped_by(df)
  output_cols <- avoid_conflict(grouped_col, c("pair.name.x", "pair.name.y", "correlation", "p_value", "statistic"))
  # check if the df's grouped
  do_cor_each <- function(df){
    if (nrow(df) < 2) {
      # Correlation require 2 or more rows.
      if (length(grouped_col) > 0) {
        # Skip this group in this case. TODO: Report error.
        return(NULL)
      }
      else {
        # This is the only group. Throw error.
        stop("More than 1 row are required to calculate correlations.")
      }
    }
    mat <- dplyr::select(df, !!!select_dots) %>%
      # Convert logical to numeric explicitly, since implicit conversion by as.matrix does not happen if all the columns are logical.
      dplyr::mutate(across(where(is.logical), as.numeric)) %>%
      as.matrix()
    ret <- do_cor_internal(mat, use, method, diag, output_cols, na.rm=TRUE)
    if (variable_order == "correlation") {
      # Set factor levels to pair.name.x and pair.name.y based on the mean of correlations with other columns.
      cor0 <- ret %>% dplyr::filter(pair.name.x != pair.name.y)
      cor0 <- cor0 %>% dplyr::group_by(pair.name.x) %>% dplyr::summarize(mean_cor=mean(correlation, na.rm=TRUE)) %>% dplyr::arrange(desc(mean_cor))
      ret <- ret %>% dplyr::mutate(pair.name.x = forcats::fct_relevel(pair.name.x, cor0$pair.name.x), pair.name.y = forcats::fct_relevel(pair.name.y, cor0$pair.name.x))
    }
    else { # "input" case. Honor the specified variable order.
      ret <- ret %>% dplyr::mutate(pair.name.x = forcats::fct_relevel(pair.name.x, !!select_dots), pair.name.y = forcats::fct_relevel(pair.name.y, !!select_dots))
    }
    if (distinct) {
      # Keep only one of each symmetric pair, using the factor level order set above.
      ret <- ret %>% dplyr::filter(as.integer(pair.name.x) <= as.integer(pair.name.y))
    }
    if (return_type == "data.frame") {
      # Sort the data for step output to look better organized on the table view.
      ret <- ret %>% dplyr::arrange(pair.name.x, pair.name.y)
      # Revert the variable names to character for step output.
      ret <- ret %>% dplyr::mutate(pair.name.x = as.character(pair.name.x), pair.name.y = as.character(pair.name.y))
      ret # Return correlation data frame as is.
    }
    else {
      # Return cor_exploratory model, which is a set of correlation data frame and the original data.
      # We use the original data for scatter matrix on Analytics View.
      ret <- list(cor = ret, data = df)
      class(ret) <- c("cor_exploratory", class(ret))
      ret
    }
  }
  if (return_type == "data.frame") {
    # Run per group, store each result in a temporary column, then unnest.
    df %>%
      dplyr::do_(.dots=setNames(list(~do_cor_each(.)), output_cols[[1]])) %>%
      dplyr::ungroup() %>%
      unnest_with_drop(!!rlang::sym(output_cols[[1]]))
  }
  else {
    # Model output: keep one cor_exploratory model per group, without unnesting.
    do_on_each_group(df, do_cor_each, name = "model", with_unnest = FALSE)
  }
}
# Compute pairwise correlations plus P-values and test statistics for the
# columns of a numeric matrix, returned as a long data frame.
# mat         - numeric matrix; columns are the variables to correlate.
# use, method - passed to cor() / cor.test().
# diag        - passed to mat_to_df; whether to keep diagonal pairs.
# output_cols - the 5 output column names (pair.x, pair.y, correlation, p_value, statistic).
# na.rm       - passed to mat_to_df for the correlation matrix.
do_cor_internal <- function(mat, use, method, diag, output_cols, na.rm) {
  # Sort the column name.
  # Now that we sort the variables based on correlation result or input order, this might not be as meaningful,
  # but I'm hoping this might still help align the result between Mac and Windows if there are ties in the mean correlations.
  # We use stringr::str_sort() as opposed to base sort() so that the result is consistent on Windows too.
  sorted_colnames <- stringr::str_sort(colnames(mat))
  # drop = FALSE keeps a 1-column matrix a matrix instead of collapsing to a vector.
  mat <- mat[, sorted_colnames, drop = FALSE]
  cor_mat <- cor(mat, use = use, method = method)
  ret <- mat_to_df(cor_mat,
                   cnames=output_cols[1:3],
                   diag=diag,
                   na.rm=na.rm,
                   zero.rm=FALSE)
  # Create a matrix of P-values for Analytics View case.
  dim <- length(sorted_colnames)
  pvalue_mat <- matrix(NA_real_, dim, dim)
  tvalue_mat <- matrix(NA_real_, dim, dim)
  if (dim >= 2) { # Guard: with a single column, 2:dim would iterate c(2, 1) out of bounds.
    for (i in 2:dim) {
      for (j in seq_len(i - 1)) {
        # tryCatch must RETURN the values; assigning to pvalue_mat/tvalue_mat
        # inside the handler would only modify handler-local copies.
        res <- tryCatch({
          cor_test_res <- cor.test(mat[, i], mat[, j], method = method)
          c(cor_test_res$p.value, unname(cor_test_res$statistic))
        }, error = function(e) {
          if (conditionMessage(e) == "not enough finite observations") {
            # This is the error cor.test returns when there is not enough non-NA data.
            # Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI.
            c(NA_real_, NA_real_)
          }
          else {
            stop(e)
          }
        })
        # Fill both triangles; the matrices are symmetric.
        pvalue_mat[i, j] <- res[[1]]
        tvalue_mat[i, j] <- res[[2]]
        pvalue_mat[j, i] <- res[[1]]
        tvalue_mat[j, i] <- res[[2]]
      }
    }
  }
  for (i in seq_len(dim)) { # For i=j case, P value should be always 0 and t statistic should be Inf.
    pvalue_mat[i, i] <- 0
    tvalue_mat[i, i] <- Inf
  }
  colnames(pvalue_mat) <- sorted_colnames
  rownames(pvalue_mat) <- sorted_colnames
  colnames(tvalue_mat) <- sorted_colnames
  rownames(tvalue_mat) <- sorted_colnames
  p_value_ret <- mat_to_df(pvalue_mat, cnames=output_cols[c(1,2,4)], diag=diag, zero.rm=FALSE)
  t_value_ret <- mat_to_df(tvalue_mat, cnames=output_cols[c(1,2,5)], diag=diag, zero.rm=FALSE)
  ret <- ret %>% dplyr::left_join(p_value_ret, by=output_cols[1:2]) # Join by pair.name.x and pair.name.y.
  ret <- ret %>% dplyr::left_join(t_value_ret, by=output_cols[1:2]) # Join by pair.name.x and pair.name.y.
  ret
}
#' @export
tidy.cor_exploratory <- function(x, type = "cor", ...) { #TODO: add test
  # Extract one component of a cor_exploratory model:
  # the correlation data frame ("cor") or the original data (anything else).
  if (identical(type, "cor")) x$cor else x$data
}
#' Non standard evaluation version for do_cmdscale_
#' @return Tidy format of data frame.
#' @export
do_cmdscale <- function(df,
                        pair_name1,
                        pair_name2,
                        value,
                        ...){
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  # Resolve the unquoted (NSE) column arguments into character column names,
  # then delegate to the standard-evaluation implementation.
  p1 <- col_name(substitute(pair_name1))
  p2 <- col_name(substitute(pair_name2))
  v <- col_name(substitute(value))
  do_cmdscale_(df, p1, p2, v, ...)
}
#' Map dist result to k dimensions via classical multidimensional scaling.
#' @param df Data frame which has pair columns and a distance value column.
#' @param pair1_col Name of the first pair-name column.
#' @param pair2_col Name of the second pair-name column.
#' @param value_col Name of the distance value column.
#' @param k Number of dimensions to map the points into.
#' @param fun.aggregate Aggregate function for duplicated pair combinations.
#' @param fill Value for missing cells when casting to a matrix.
#' @return Tidy format of data frame: one row per point with a name column and axis1..axisk.
#' @export
do_cmdscale_ <- function(df,
                         pair1_col,
                         pair2_col,
                         value_col,
                         k=2,
                         fun.aggregate=mean,
                         fill=0){
  validate_empty_data(df)
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  grouped_col <- grouped_by(df)
  # remove NA because cmdscale doesn't accept NA.
  # NA happens if a data has NA in a value and
  # all distances with others become NA too, so
  # it should be totally removed from the data
  df <- df %>%
    dplyr::filter(!is.na(!!as.symbol(value_col)))
  # if pair1_col and pair2_col are with the same name
  # like aaa.x and aaa.y (output of skv of do_dist),
  # it (in this case, "aaa") should be used as name column
  # because the results are coordinates of them
  # but pair.name.x and pair.name.y is from do_dist.cols and it isn't
  # a valid name for name column because the output is no longer pair
  # , so in that case, it will be just "name"
  name <- stringr::str_replace(pair1_col, "\\.[x|y]$", "")
  name_col <- if(name != "pair.name" && name == stringr::str_replace(pair2_col, "\\.[x|y]$", "")){
    name
  } else {
    avoid_conflict(grouped_col, "name")
  }
  # this is executed on each group
  do_cmdscale_each <- function(df){
    if (all(df[[value_col]]==0)) {
      # cmdscale() returns broken dataframe with only a column
      # for the names of points and no columns for coordinate values,
      # which would break processing after that.
      # NAs are prefiltered at this point already.
      # Inf can be handled within cmdscale().
      stop("All distances are 0. Multidimensional scaling cannot be calculated.")
    }
    mat <- simple_cast(
      df,
      pair1_col,
      pair2_col,
      value_col,
      fun.aggregate = fun.aggregate,
      fill=fill,
      na.rm = TRUE
    )
    cnames <- colnames(mat)
    rnames <- rownames(mat)
    # !identical() instead of any(cnames != rnames): the elementwise comparison
    # warns and recycles when the lengths differ, which is exactly the case
    # this branch is meant to handle.
    if(!identical(cnames, rnames)){
      diffcol <- setdiff(rnames, cnames)
      diffrow <- setdiff(cnames, rnames)
      # && (scalar, short-circuit) rather than & inside an if() condition.
      if(!(length(diffcol)==1 && length(diffrow)==1)){
        # paste0: the previous code passed collapse=" " to stop() by mistake,
        # which appended a stray blank to the error message.
        stop(paste0("Can't create dist matrix from ", pair1_col, " and ", pair2_col))
      } else {
        # Create diagonal elements to be recognized as dist matrix
        mat <- cbind(matrix(0, nrow=nrow(mat), ncol=1, dimnames = list(NULL, diffcol)), mat)
        mat <- rbind(mat, matrix(0, nrow=1, ncol=ncol(mat), dimnames = list(diffrow, NULL)))
      }
    }
    if (ncol(mat) <= k) {
      # this causes an error in cmdscale, so should be validated
      stop("Number of unique points should be more than the number of dimensions")
    }
    points <- cmdscale(as.dist(t(mat)), eig=FALSE, k=k)
    result_df <- as.data.frame(points)
    # these column names should be consistent with the result of do_svd
    colnames(result_df) <- paste("axis", seq(ncol(result_df)), sep = "")
    df <- setNames(data.frame(rownames(points), stringsAsFactors=FALSE), name_col)
    ret <- cbind(df, result_df)
    ret
  }
  # Calculation is executed in each group.
  # Storing the result in this tmp_col and
  # unnesting the result.
  # If the original data frame is grouped by "tmp",
  # overwriting it should be avoided,
  # so avoid_conflict is used here.
  tmp_col <- avoid_conflict(grouped_col, "tmp")
  df %>%
    dplyr::do_(.dots=setNames(list(~do_cmdscale_each(.)), tmp_col)) %>%
    dplyr::ungroup() %>%
    unnest_with_drop(!!rlang::sym(tmp_col))
}
# NOTE: The two lines below were non-code boilerplate picked up from a web
# documentation page ("Add the following code to your website. / For more
# information on customizing the embed code, read Embedding Snippets.") and
# have been commented out so this file parses as R.