library(dplyr)
library(data.table)
library(stats)
library(tibble)
library(tidyr)
library(ggplot2)
library(corrplot)
library(grDevices)
#' @title func
#' @description Returns list of Numeric and Character Columns
#' @param df
#' @return res
#' @export split.vectors
split.vectors <- function(df)
{
num_ind <- which(sapply(df,class) %in% c("integer","numeric","double"))
num.vect <- colnames(df)[num_ind]
char_ind <- which(sapply(df,class) %in% c("character","factor"))
char.vect <- colnames(df)[char_ind]
res <- list("Numeric_Columns" = num.vect,"Character_Columns" = char.vect)
return(res)
}
#' @title func
#' @description Detects the Numerical Columns in the DataFrame and provides the Summary Statistics
#' @param df
#' @return res
#' @export numeric_summary
numeric_summary <- function(df)
{
num_ind <- which(sapply(df,class) %in% c("integer","numeric","double"))
num_df <- subset.data.frame(df,select = names(df)[num_ind])
min <- sapply(num_df,min,na.rm = T)
max <- sapply(num_df,max,na.rm = T)
per.25 <- sapply(num_df,quantile,na.rm = T)[2,]
per.50 <- sapply(num_df,quantile,na.rm = T)[3,]
per.75 <- sapply(num_df,quantile,na.rm = T)[4,]
mean <- sapply(num_df,mean,na.rm = T)
median <- sapply(num_df,median,na.rm = T)
variance <- sapply(num_df,function(x){options(scipen = 10)
var(x,na.rm = T)})
sd <- sapply(num_df,sd,na.rm = T)
per.na <- round(colSums(is.na(num_df))/nrow(df),3)*100
count.na <- colSums(is.na(num_df))
per.out <- sapply(num_df,function(x){
round(length(boxplot.stats(x)$out)/nrow(num_df),3)*100})
res <- data.frame(min,max,per.25,per.50,per.75,mean,median,variance,sd,count.na,per.na,per.out)
res <- as.data.frame(t(res),stringsAsFactors = F)
res <- tibble::rownames_to_column(res,var = "STATISTIC")
return(res)
}
#' @title func
#' @description Detects the Character and Factor Columns in the DataFrame and provides the Summary Statistics
#' @param df
#' @return res
#' @export factor_summary
factor_summary <- function(df)
{
fact.char_ind <- which(sapply(df, class) %in% c("character","factor"))
fact_df <- subset(df, select = colnames(df)[fact.char_ind])
## Unique Levels
uniq.levels <- sapply(fact_df, function(x) {
length(levels(as.factor(x)))
})
## Mode of the Data Column
mode <- sapply(fact_df, function(x) {
names(table(x)[which.max(table(x))])
})
mode.freq <- sapply(fact_df, function(x) {
table(x)[which.max(table(x))]
})
## Percentage of Most repeating Level
per.levels <- sapply(fact_df, function(x) {
round(sum(cumsum(sort(table(x)/sum(table(x)) * 100, decreasing = T)) <
80)/length(levels(as.factor(x))) * 100, 3)
})
res <- data.frame(uniq.levels, mode, mode.freq, per.levels)
res <- as.data.frame(t(res), stringsAsFactors = F)
res <- tibble::rownames_to_column(res, var = "STATISTIC")
return(res)
}
#' @title func
#' @description Sturge Formula to convert the Numerical column into appropriate Class Intervals.
#' @param df
#' @return freq.table
#' @export factor_summary
sturge <- function (vect, bins = 0,cat.col = TRUE)
{
if (class(vect) == "character")
return(transform(table(vect)))
n <- length(vect)
if (n == 1)
return(table(vect))
low <- round(min(vect, na.rm = TRUE))
high <- round(max(vect, na.rm = TRUE))
k <- round(log2(n))
if (bins != 0) {
width <- round((high - low)/(bins - 1))
}else {
width <- round((high - low)/k)
}
if (width > 0) {
bins <- seq(low, high + width, width)
}else {
stop("Width of Interval is not Correct or NA values in DataSet")
}
interval <- cut(vect, bins, dig.lab = 5)
freq.table <- transform(table(interval))
ls = list("Freq.Table" = freq.table,"Vect" = interval)
return(ls)
}
#' @title func
#' @description Detects the Numerical Columns and performs Pairwise t-test for all Numerical Columns in DataFrame
#' @param df
#' @return res
#' @export factor_summary
DF.ttest <- function(data,num_cols = c())
{
require(plyr,quietly = T)
data <- data.frame(data)
if(length(num_cols) > 0)
{
num_cols <- names(data)[num_cols]
}else{
num_cols <- split.vectors(data)$Numeric_Columns
}
data <- data[,num_cols]
combos <- combn(ncol(data),2)
res <- data.frame()
for(i in 1:ncol(combos))
{
result <- t.test(data[,combos[1,i]],data[,combos[2,i]])
df <- data.frame("Column_1" = colnames(data)[combos[1,i]],
"Column_1.Mean" = result$estimate[1],
"Column_2" = colnames(data[combos[2,i]]),
"Column_2.Mean" = result$estimate[2],
"t_Value" = as.numeric(sprintf("%.3f", result$statistic)),
"Deg_of_Freedom"= result$parameter,
"p_value" = as.numeric(sprintf("%.3f", result$p.value)))
res <- rbind(res,df)
}
res <- res[order(res[,"p_value"],decreasing = T),]
row.names(res) <- 1:nrow(res)
return(res)
}
#' @title func
#' @description Returns the most repeating value in the Vector
#' @param vect
#' @return ind
#' @export mode
mode <- function(vect)
{
`%>%` <- dplyr::`%>%`
vect <- vect[!is.na(vect)]
ind <- which(table(vect) == max(table(vect)))
ind <- names(ind) %>% as.numeric()
if(length(ind) > 1){
if(sum(mean(vect) == ind) > 0)return(mean(vect))
return(sort(ind)[round(length(ind)/2)])
}else
{
return(ind)
}
}
#' @title func
#' @description Gives the Distribution Plots for all Numeric Columns in the DataFrame
#' @param df
#' @return plot
#' @export Numeric.Dist
Numeric.Dist <- function(df)
{
`%>%` <- dplyr::`%>%`
num_cols <- split.vectors(df)$Numeric_Columns
if(length(num_cols)== 0)stop("Error: No Numeric Columns present in Dataset!")
num_df <- df[,num_cols] %>% as.data.frame()
names(num_df) <- num_cols
numeric_df <- num_df %>% tidyr::gather(key,value)
mean.df <- data.frame("key" = unique(numeric_df$key),"Mean" = sapply(num_df,mean,na.rm = T),"Label" = rep("Mu",ncol(num_df)))
median.df <- data.frame("key" = unique(numeric_df$key),"Median" = sapply(num_df,median,na.rm = T),"Label" = rep("Med",ncol(num_df)))
mode.df <- data.frame("key" = unique(numeric_df$key),"Mode" = sapply(num_df,mode),"Label" = rep("Mod",ncol(num_df)))
numeric_df %>% ggplot2::ggplot(ggplot2::aes(value)) + ggplot2::geom_density(ggplot2::aes(y = ..count..),fill = "steelblue4",size = 1) +
ggplot2::geom_vline(data = mean.df,ggplot2::aes(xintercept = Mean),linetype = "dashed",color = "red",size = 0.8) +
ggplot2::geom_vline(data = median.df,ggplot2::aes(xintercept = Median),linetype = "dashed",color = "green1",size = 0.8) +
ggplot2::geom_vline(data = mode.df,ggplot2::aes(xintercept = Mode),linetype = "dashed",color = "yellow2",size = 0.8) +
ggplot2::facet_wrap(~key,scales = "free") + ggplot2::geom_text(data = mean.df,ggplot2::aes(x = Mean,label = Label, y = 0.2),inherit.aes = F,color = "red")+
ggplot2::geom_text(data = median.df,ggplot2::aes(x = Median,label = Label, y = 0.4),inherit.aes = F,color = "green1") +
ggplot2::geom_text(data = mode.df,ggplot2::aes(x = Mode,label = Label, y = 0.6),inherit.aes = F,color = "yellow2") + ggplot2::theme_bw()
}
#' @title func
#' @description Gives the Distribution Plots for all Character Columns in the DataFrame
#' @param df
#' @return plot
#' @export Character.Dist
Character.Dist <- function(df)
{
`%>%` <- dplyr::`%>%`
char_cols <- split.vectors(df)$Character_Columns
if(length(char_cols)== 0)stop("Error: No Character or Factor Columns present in Dataset!")
char_df <- df[,char_cols] %>% as.data.frame()
names(char_df) <- char_cols
character_df <- char_df %>% tidyr::gather(key,value)
character_df <- character_df %>% dplyr::group_by(key,value) %>% dplyr::summarise("Count" = n())
ggplot2::ggplot(character_df,ggplot2::aes(x = stats::reorder(value,Count), y = Count,fill = value)) + ggplot2::geom_bar(stat = "identity") + ggplot2::facet_wrap(~key,scales = "free")+ ggplot2::coord_flip() + ggplot2::guides(fill = F)
}
#' @title func
#' @description Gives the pariwise annova test results for all numerical and categorical columns in the Dataset
#' @param df
#' @return res
#' @export DF.aov_ttest
DF.aov_ttest <- function(df)
{
`%>%` <- dplyr::`%>%`
df <- as.data.frame(df)
n <- split.vectors(df)$Numeric_Columns
if(length(n) == 0)stop("No Numeric Columns present in the Dataset")
c <- split.vectors(df)$Character_Columns
if(length(c) == 0)stop("No Character Columns present in the Dataset")
num_df <- df[,n] %>% as.data.frame()
names(num_df) <- n
char_df <- df[,c] %>% as.data.frame()
names(char_df) <- c
char_cols <- c()
num_cols <- c()
p.value <- c()
for(i in 1:ncol(char_df))
{
for(j in 1:ncol(num_df))
{
if(length(levels(as.factor(char_df[,i]))) <= 2)
{
char_cols <- c(char_cols,names(char_df)[i])
num_cols <- c(num_cols,names(num_df)[j])
p <- round(t.test(num_df[char_df[,i] == levels(as.factor(char_df[,i]))[1],j],num_df[char_df[,i] == levels(as.factor(char_df[,i]))[2],j])$p.value,4)
p.value <- c(p.value,p)
}else{
char_cols <- c(char_cols,names(char_df)[i])
num_cols <- c(num_cols,names(num_df)[j])
r <- summary(aov(num_df[,j] ~ char_df[,i]))
p <- ifelse(length(r[[1]]) < 5,0,round(r[[1]][5]$`Pr(>F)`[1],4))
p.value <- c(p.value,p)
}
}
}
res <- data.frame("Character_Column" = char_cols, "Numeric_Column" = num_cols, "P_Value" = p.value)
res <- res %>% dplyr::arrange(-P_Value)
return(res)
}
#' @title func
#' @description Gives the correlation between numerical variables in the Data Set
#' @param df
#' @return res
#' @export DF.corr_plot
DF.corr_plot <- function(df)
{
`%>%` <- dplyr::`%>%`
df <- na.omit(df)
num_df <- df %>% dplyr::select(split.vectors(df)$Numeric_Columns) %>% as.data.frame()
if(ncol(num_df) <= 1)
stop("In Sufficient Numeric Columns in Data Frame")
cor.mtest <- function(mat) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j])
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
p.mat <- cor.mtest(num_df)
corr_data <- stats::cor(num_df)
col <- grDevices::colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot::corrplot(corr_data, method="color", col=col(200),
type="upper", order="hclust",
addCoef.col = "black", # Add coefficient of correlation
# Combine with significance
p.mat = p.mat, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.