POMA EDA Example

Compiled date: r Sys.Date()

Last edited: 2020-08-03

License: r packageDescription("POMA")[["License"]]

knitr::opts_chunk$set(
  collapse = TRUE,
  # fig.align = "center",
  comment = ">"
)

Installation

Run the following code to install the Bioconductor version of package.

# install.packages("BiocManager")
BiocManager::install("POMA")

Load POMA

library(POMA)

Automatic EDA Report

The following function will return an Exploratory Data Analysis (EDA) PDF report. The input object must be an MSnSet object.

data("st000336")
PomaEDA(st000336)
library(patchwork)
library(knitr)
library(dplyr)
library(tibble)
library(ggplot2)
library(reshape2)
library(Biobase)

e <- t(Biobase::exprs(st000336))
target <- Biobase::pData(st000336) %>% rownames_to_column("ID") %>% rename(Group = 2) %>% select(ID, Group)
imputed <- PomaImpute(st000336, method = "knn")
pre_processed <- PomaNorm(imputed, method = "log_pareto") %>%
  PomaOutliers(coef = 1.5)
# zeros
zeros <- data.frame(number = colSums(e == 0, na.rm = TRUE)) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

all_zero <- zeros %>% 
  filter(number == nrow(e))

# missing values
nas <- data.frame(number = colSums(is.na(e))) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

# zero variance
var_zero <- e %>%
  as.data.frame() %>%
  summarise_all(~ var(., na.rm = TRUE)) %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("names") %>%
  filter(V1 == 0)

Generated EDA PDF report starts here.

Know your data

Summary Table

summary_table1 <- data.frame(Samples = nrow(e),
                             Features = ncol(e),
                             Covariates = ncol(Biobase::pData(st000336)) - 1)
summary_table2 <- data.frame(Counts_Zero = sum(zeros$number),
                             Percent_Zero = paste(round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2), "%"))
summary_table3 <- data.frame(Counts_NA = sum(is.na(e)),
                             Percent_NA = paste(round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2), "%")) 
knitr::kable(summary_table1)
knitr::kable(summary_table2)
knitr::kable(summary_table3)
if (nrow(nas) >= 1){
  ggplot(nas, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Missing values") +
    xlab("") +
    ggtitle("Missing Value Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}
if (nrow(zeros) >= 1){
  ggplot(zeros, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Zeros") +
    xlab("") +
    ggtitle("Zeros Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}

Samples by Group

counts <- data.frame(table(target$Group))
colnames(counts) <- c("Group", "Counts")

ggplot(counts, aes(reorder(Group, Counts), Counts, fill = Group)) +
  geom_col() +
  ylab("Counts") +
  xlab("") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

Normalization Plots

indNum <- nrow(Biobase::pData(pre_processed))
jttr <- ifelse(indNum <= 10, TRUE, FALSE)

p1 <- PomaBoxplots(imputed, jitter = jttr) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle("Not Normalized") +
  theme(legend.position = "bottom")

p2 <- PomaBoxplots(pre_processed, jitter = jttr) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle("Normalized ('log_pareto')") +
  theme(legend.position = "bottom")

p1 + p2

Group Distribution Plots

p3 <- PomaDensity(imputed) +
  ggtitle("Not Normalized")

p4 <- PomaDensity(pre_processed) +
    ggtitle("Normalized ('log_pareto')")

p3/p4

Outlier Detection

outliers <- st000336 %>% 
  PomaImpute(method = "knn") %>%
  PomaNorm(method = "log_pareto") %>%
  PomaOutliers(do = "analyze", coef = 1.5)
outliers$polygon_plot

7 possible outliers detected in your data. These outliers are ‘DMD119.2.U02’, ‘DMD084.11.U02’, ‘DMD087.12.U02’, ‘DMD023.10.U02’, ‘DMD046.11.U02’, ‘DMD133.9.U02’, ‘DMD135.10.U02’.

if(nrow(outliers$outliers) >= 1){
  knitr::kable(outliers$outliers)
  }

High Correlated Features (r > 0.97)

correlations <- PomaCorr(pre_processed)
high_correlations <- correlations$correlations %>% filter(abs(corr) > 0.97)

There are 0 high correlated feature pairs in your data.

Heatmap and Clustering

PomaHeatmap(pre_processed)

Principal Component Analysis

PomaMultivariate(pre_processed, method = "pca")$scoresplot


Try the POMA package in your browser

Any scripts or data that you put into this service are public.

POMA documentation built on Nov. 8, 2020, 6:26 p.m.