POMA `r paste0('(', packageVersion('POMA'), ')')`: Exploratory Data Analysis Report

# This file is part of POMA.

# POMA is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# POMA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with POMA. If not, see <https://www.gnu.org/licenses/>.

library(patchwork)
library(knitr)
library(dplyr)
library(tibble)
library(ggplot2)
library(reshape2)
library(Biobase)
library(POMA)

e <- t(Biobase::exprs(data))
target <- Biobase::pData(data) %>% rownames_to_column("ID") %>% rename(Group = 2) %>% select(ID, Group)

\pagebreak

if(clean_outliers){

  imputed <- PomaImpute(data, method = imputation)
  pre_processed <- PomaNorm(imputed, method = normalization) %>%
    PomaOutliers(coef = coeff_outliers)

} else {

    imputed <- PomaImpute(data, method = imputation)
    pre_processed <- PomaNorm(imputed, method = normalization)

}
# zeros
zeros <- data.frame(number = colSums(e == 0, na.rm = TRUE)) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

all_zero <- zeros %>% 
  filter(number == nrow(e))

# missing values
nas <- data.frame(number = colSums(is.na(e))) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

# zero variance
var_zero <- e %>%
  as.data.frame() %>%
  summarise_all(~ var(., na.rm = TRUE)) %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("names") %>%
  filter(V1 == 0)

Know your data

Summary Table

summary_table1 <- data.frame(Samples = nrow(e),
                             Features = ncol(e),
                             Covariates = ncol(Biobase::pData(data)) - 1)
summary_table2 <- data.frame(Counts_Zero = sum(zeros$number),
                             Percent_Zero = paste(round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2), "%"))
summary_table3 <- data.frame(Counts_NA = sum(is.na(e)),
                             Percent_NA = paste(round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2), "%")) 
knitr::kable(summary_table1)
knitr::kable(summary_table2)
knitr::kable(summary_table3)
if (nrow(nas) >= 1){
  ggplot(nas, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Missing values") +
    xlab("") +
    ggtitle("Missing Value Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}
if (nrow(zeros) >= 1){
  ggplot(zeros, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Zeros") +
    xlab("") +
    ggtitle("Zeros Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}

Samples by Group

counts <- data.frame(table(target$Group))
colnames(counts) <- c("Group", "Counts")

ggplot(counts, aes(reorder(Group, Counts), Counts, fill = Group)) +
  geom_col() +
  ylab("Counts") +
  xlab("") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

Normalization Plots

indNum <- nrow(Biobase::pData(pre_processed))
jttr <- ifelse(indNum <= 10, TRUE, FALSE)

p1 <- PomaBoxplots(imputed, jitter = jttr) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle("Not Normalized") +
  theme(legend.position = "bottom")

p2 <- PomaBoxplots(pre_processed, jitter = jttr) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle(paste0("Normalized (", normalization, ")")) +
  theme(legend.position = "bottom")

p1 + p2

Group Distribution Plots

p3 <- PomaDensity(imputed) +
  ggtitle("Not Normalized")

p4 <- PomaDensity(pre_processed) +
  ggtitle(paste0("Normalized (", normalization, ")"))

p3/p4

Outlier Detection

outliers <- data %>% 
  PomaImpute(method = imputation) %>%
  PomaNorm(method = normalization) %>%
  PomaOutliers(do = "analyze", coef = coeff_outliers)
outliers$polygon_plot

r nrow(outliers$outliers) possible outliers detected in your data. r ifelse(nrow(outliers$outliers) >= 1, paste0("These outliers are **",noquote(paste(shQuote(paste0(outliers$outliers$sample)), collapse=", ")),"**."), "")

if(nrow(outliers$outliers) >= 1){
  knitr::kable(outliers$outliers)
  }

High Correlated Features (r > 0.97)

correlations <- PomaCorr(pre_processed)
high_correlations <- correlations$correlations %>% filter(abs(corr) > 0.97)

There are r nrow(high_correlations) high correlated feature pairs in your data. r ifelse(nrow(high_correlations) >= 1, paste0("These features are **",noquote(paste(shQuote(paste0(high_correlations$Var1, " - " , high_correlations$Var2)), collapse=", ")),"**."), "")

Heatmap and Clustering

PomaHeatmap(pre_processed)

Principal Component Analysis

PomaMultivariate(pre_processed, method = "pca")$scoresplot


Try the POMA package in your browser

Any scripts or data that you put into this service are public.

POMA documentation built on Nov. 8, 2020, 6:26 p.m.