vignettes/PracticalPCA.R

## ------------------------------------------------------------------------
library(ggplot2) 

## ------------------------------------------------------------------------
# Load the data 
data(iris)

# Size of the dataframe 
dim(iris)

# First few lines of the dataset 
head(iris) 

## ------------------------------------------------------------------------
# Summary statistics 
summary(iris) 

## ------------------------------------------------------------------------
pca_iris <- prcomp(iris[,-5]) 

## ------------------------------------------------------------------------
summary(pca_iris) 

## ------------------------------------------------------------------------
plot_iris <- data.frame(pca_iris$x, Species=iris$Species) 
head(plot_iris)

## ------------------------------------------------------------------------
ggplot(plot_iris, aes(x=PC1, y=PC2, color=Species)) + geom_point()  + coord_fixed()

## ------------------------------------------------------------------------
ggplot(plot_iris, aes(x=PC3, y=PC4, color=Species)) + geom_point() + coord_fixed()

## ------------------------------------------------------------------------
with(plot_iris, plot(x=PC1, y=PC2, col=Species)); grid() 

## ------------------------------------------------------------------------
# Load the data 
suppressPackageStartupMessages(library(pdfCluster))
data(oliveoil)

# Size of the dataframe 
dim(oliveoil)

# First few lines of the dataset 
head(oliveoil) 

## ------------------------------------------------------------------------
# Summary statistics 
summary(oliveoil) 

## ------------------------------------------------------------------------
# PCA without the first two columns
pca_oil <- prcomp(oliveoil[,-c(1,2)]) 

# Save the data for plotting
plot_oil <- data.frame(pca_oil$x, oliveoil[,1:2]) 

## ------------------------------------------------------------------------
ggplot(plot_oil, aes(x=PC1, y=PC2, color=region, shape=macro.area)) + geom_point() + coord_fixed() 

## ------------------------------------------------------------------------
library(Shenzen2016)
data("zebrafish")

## ------------------------------------------------------------------------
above_threshold <- rowSums(zebrafish$Expression >= 10) >= 3
EM_zebra <- subset(zebrafish$Expression, above_threshold)

## ------------------------------------------------------------------------
library(edgeR)

# Create dge list object
dge_zebra <- DGEList(EM_zebra)

# Calculate normalization factors
dge_zebra <- calcNormFactors(dge_zebra, method="TMM")

# Normalize the expression values
logTMM_zebra <- cpm(dge_zebra, log=TRUE, prior.count=3)

## ------------------------------------------------------------------------
pca_zebra <- prcomp(t(logTMM_zebra))
summary(pca_zebra)

## ------------------------------------------------------------------------
plot_zebra <- data.frame(pca_zebra$x, zebrafish$Design) 
head(plot_zebra) 

## ------------------------------------------------------------------------
ggplot(plot_zebra, aes(x=PC1, y=PC2, color=gallein)) + geom_point() + coord_fixed() 

## ------------------------------------------------------------------------
ggplot(plot_zebra, aes(x=PC3, y=PC4, color=gallein)) + geom_point() + coord_fixed()

## ------------------------------------------------------------------------
# Load the data
data("yeast")
data("pasilla")
data("tissues")

# Inspect the dimensionality
dim(yeast$Expression)

# Inspect the design by
yeast$Design

## ------------------------------------------------------------------------
# Set a dataset: either yeast, pasilla or tissues
dataset <- pasilla

# Trim: Play around with numbers here 
EM <- subset(dataset$Expression, rowSums(dataset$Expression >= 5) >= 3)

# Calculate normalization factors: Play around with the method argument (RLE, "none")
dge <- calcNormFactors(DGEList(EM), method="TMM")

# Normalize the expression values (Advanced, play around with prior.count)
logTMM <- cpm(dge, log=TRUE, prior.count=5)

# Perform the PCA and save data for plotting
pca <- prcomp(t(logTMM))
plot_data <- data.frame(pca$x, dataset$Design) 

## ------------------------------------------------------------------------
ggplot(plot_data, aes(x=PC1, y=PC2, color=condition, shape=type)) + geom_point() + coord_fixed() 
MalteThodberg/Shenzen2016 documentation built on May 7, 2019, 2:09 p.m.